def loadFeatures(self, ftfile):  ### Loads features from given file
    '''
    Loads features from the given delimited file into self.dict['Features'].
    >> ftfile:str = Feature file name. Skipped silently if '' or 'none'.
    Expects a header row containing feature/start/end/description fields
    (optionally prefixed 'ft_'); first column is the sequence identifier.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if ftfile in ['', 'none']: return
        if not os.path.exists(ftfile):
            # Bug fix: original message had no % (ftfile) substitution, so it logged a literal "%s".
            return self.printLog('#ERR', 'Features file "%s" missing' % ftfile)
        delimit = rje.delimitFromExt(filename=ftfile)
        ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        headers = rje.readDelimit(open(ftfile, 'r').readline(), delimit)
        mainkeys = [headers[0]]                 # First column = sequence ID
        hmap = {}                               # Lower-case header -> actual header
        for h in headers: hmap[h.lower()] = h
        pos = ''                                # Leader for start/end positions
        if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
        for h in ['feature', '%sstart' % pos, '%send' % pos, 'description']:
            if h not in hmap:
                return self.printLog('#ERR', 'No %s field detected in "%s" features file' % (h, ftfile))
            mainkeys.append(hmap[h])
        mainkeys.remove(hmap['description'])    # Description is a data field, not part of the key
        ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ftdata = rje.dataDict(self, ftfile, mainkeys, ['description'], delimit, headers, lists=True)
        (mx, mtot, fx) = (0.0, len(ftdata), 0)
        for mainkey in rje.sortKeys(ftdata):
            self.progLog('\r#FT', 'Loading features from %s: %.2f%%' % (ftfile, mx / mtot))
            mx += 100.0
            # Key is the joined (ID, feature, start, end) tuple.
            (sid, ft, start, end) = string.split(mainkey, delimit)
            if sid == mainkeys[0]: continue     # Skip repeated header row
            if sid not in self.dict['Features']: self.dict['Features'][sid] = []
            for desc in ftdata[mainkey][hmap['description']]:
                fx += 1
                self.dict['Features'][sid].append({'Type': ft, 'Start': int(start), 'End': int(end), 'Desc': desc})
        self.printLog('\r#FT', 'Loaded %s features for %s IDs from %s' % (rje.integerString(fx), rje.integerString(len(self.dict['Features'])), ftfile))
    except: self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
def readSLiMSearchOcc(self, motifs=[]):  ### Reads SLiMSearch results into data dictionary
    '''
    Reads SLiMSearch occurrence results into self.dict['Occ'].
    >> motifs:list = Motif names to keep occurrences for (read-only; never mutated).
    Occurrences are filtered by MinHom and by GO-linkage of the spoke gene, then
    stored as self.dict['Occ'][motif][type][gene] = [occurrence data dicts].
    '''
    try:
        ### ~ [1] Read ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.dict['Occ'] = {}
        if not motifs:
            # Bug fix: original logged this error but then scanned the whole occurrence
            # file anyway (every row fails the "motif in motifs" test). Return early.
            return self.printLog('#OCC', 'Cannot process occurrences for No motifs!')
        occfile = '%s.csv' % self.info['ResFile']
        delimit = rje.delimitFromExt(filename=occfile)
        data = rje.dataDict(self, occfile, mainkeys=['Motif', 'Seq', 'Start_Pos', 'End_Pos'],
                            datakeys=string.split('Seq,Desc,Start_Pos,End_Pos,Cons,HomNum,GlobID,LocID,Hyd,SA', ','))
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (mx, ox, otot) = (0, 0.0, len(data))
        for occ in data:
            self.progLog('\r#OCC', 'Processing occurrences (%d motifs): %.2f%%' % (mx, ox / otot)); ox += 100.0
            if string.atoi(data[occ]['HomNum']) < self.stat['MinHom']: continue  # Too few homologues
            (motif, seq, start, end) = string.split(occ, delimit)
            if motif not in motifs: continue
            try:
                gene = rje.matchExp('gene:(\S+)\]', data[occ]['Desc'])[0]
                self.deBug('%s:%s' % (gene, self.ensGO(gene)))
                if not self.ensGO(gene): continue   # Only keep genes with GO annotation
            except: continue                        # No parseable gene in description
            # Classify motif as real (ELM), reversed or scrambled control.
            # 'occtype' replaces original 'type', which shadowed the builtin.
            if motif[-3:] == 'rev': (motif, occtype) = (motif[:-4], 'Rev')
            elif motif[-5:] == 'scram': (motif, occtype) = (motif[:-6], 'Scr')
            else: occtype = 'ELM'
            if motif not in self.dict['Occ']: self.dict['Occ'][motif] = {}; mx += 1
            if occtype not in self.dict['Occ'][motif]: self.dict['Occ'][motif][occtype] = {}
            if gene not in self.dict['Occ'][motif][occtype]: self.dict['Occ'][motif][occtype][gene] = []
            self.dict['Occ'][motif][occtype][gene].append(data[occ])
        self.printLog('\r#OCC', 'Processed %s occurrences: %d motifs with GO-links' % (rje.integerString(otot), mx))
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def loadMutations(self):  ### Inputs parsed mutations back into dictionaries
    '''
    Inputs parsed mutations back into dictionaries:
    self.dict['Records'] = {gene:[OMIM IDs]}; self.dict['Mutations'] = {gene:{subid:(disease,mutation)}}.
    << True if the input file was read, False otherwise.
    '''
    try:
        ### ~ [1] Setup input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.dict['Records'] = {}
        self.dict['Mutations'] = {}
        fields = ['OMIM_ID', 'SubID', 'Gene', 'Pos', 'WildAA', 'MutAA', 'Disease']
        infile = 'omim_mutations.tdt'
        if not os.path.exists(infile): return False
        datadict = rje.dataDict(self, infile, fields[:2], fields, '\t')
        mx = len(datadict)
        ### ~ [2] Process into dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Consume the dictionary entry by entry to keep memory down.
        for dkey in list(datadict.keys()):
            entry = datadict.pop(dkey)
            (record, subid, gene) = (entry['OMIM_ID'], entry['SubID'], entry['Gene'])
            mutation = '%s%s%s' % (entry['WildAA'], entry['Pos'], entry['MutAA'])
            disease = entry['Disease']
            self.dict['Records'].setdefault(gene, [])
            if record not in self.dict['Records'][gene]: self.dict['Records'][gene].append(record)
            self.dict['Mutations'].setdefault(gene, {})[subid] = (disease, mutation)
        self.log.printLog('\r#OMIM', 'Loaded %s OMIM mutations (%s genes).' % (rje.integerString(mx), rje.integerString(len(self.dict['Records']))))
        return True
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
        return False
def loadAlias(self, sourcefile):  ### Loads Alias data
    '''
    Loads Alias data into self.dict['Aliases'] via self.addAlias().
    >> sourcefile:str = Source filename ('' or 'none' to skip).
    '''
    try:
        ### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if sourcefile.lower() in ['', 'none']: return
        if not os.path.exists(sourcefile):
            return self.log.errorLog('Alias file "%s" not found' % (sourcefile), printerror=False)
        data = rje.dataDict(self, sourcefile, datakeys=['Aliases'], lists=True)
        ### ~ [2] Parse out Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (done, total) = (0.0, len(data))
        for key in data:
            self.log.printLog('\r#ALIAS', 'Processing %s: %.1f%%' % (sourcefile, done / total), newline=False, log=False)
            done += 100.0
            ## ~ [2a] Update self.dict ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Each 'Aliases' value is a comma-separated list of alias names.
            for aliaslist in data[key]['Aliases']:
                for alias in string.split(aliaslist, ','): self.addAlias(key, alias)
            if key in self.dict['Aliases']: self.dict['Aliases'][key].sort()
        self.log.printLog('\r#ALIAS', 'Processed %s: %s IDs with aliases' % (sourcefile, rje.integerString(len(self.dict['Aliases']))))
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def loadFeatures(self, ftfile):  ### Loads features from given file
    '''
    Loads features from the given delimited file into self.dict['Features'].
    >> ftfile:str = Feature file name. Skipped silently if '' or 'none'.
    Expects a header row containing feature/start/end/description fields
    (optionally prefixed 'ft_'); first column is the sequence identifier.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if ftfile in ['', 'none']: return
        if not os.path.exists(ftfile):
            # Bug fix: original message had no % (ftfile) substitution, so it logged a literal "%s".
            return self.printLog('#ERR', 'Features file "%s" missing' % ftfile)
        delimit = rje.delimitFromExt(filename=ftfile)
        ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        headers = rje.readDelimit(open(ftfile, 'r').readline(), delimit)
        mainkeys = [headers[0]]                 # First column = sequence ID
        hmap = {}                               # Lower-case header -> actual header
        for h in headers: hmap[h.lower()] = h
        pos = ''                                # Leader for start/end positions
        if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
        for h in ['feature', '%sstart' % pos, '%send' % pos, 'description']:
            if h not in hmap:
                return self.printLog('#ERR', 'No %s field detected in "%s" features file' % (h, ftfile))
            mainkeys.append(hmap[h])
        mainkeys.remove(hmap['description'])    # Description is a data field, not part of the key
        ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ftdata = rje.dataDict(self, ftfile, mainkeys, ['description'], delimit, headers, lists=True)
        (mx, mtot, fx) = (0.0, len(ftdata), 0)
        for mainkey in rje.sortKeys(ftdata):
            self.progLog('\r#FT', 'Loading features from %s: %.2f%%' % (ftfile, mx / mtot))
            mx += 100.0
            # Key is the joined (ID, feature, start, end) tuple.
            (sid, ft, start, end) = string.split(mainkey, delimit)
            if sid == mainkeys[0]: continue     # Skip repeated header row
            if sid not in self.dict['Features']: self.dict['Features'][sid] = []
            for desc in ftdata[mainkey][hmap['description']]:
                fx += 1
                self.dict['Features'][sid].append({'Type': ft, 'Start': int(start), 'End': int(end), 'Desc': desc})
        self.printLog('\r#FT', 'Loaded %s features for %s IDs from %s' % (rje.integerString(fx), rje.integerString(len(self.dict['Features'])), ftfile))
    except: self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
def pickupList(self):  ### Generates Pickup List from file(s) #V1.0
    '''
    Generates Pickup List from existing result file(s), one per OutList entry.
    << list of previously-processed keys (empty if PickHead not set).
    '''
    if not self.getStrLC('PickHead').lower(): return []
    picked = []
    for out in self.list['OutList']:
        resfile = '%s.%s' % (self.baseFile(), out)
        # Missing/unreadable result files simply contribute nothing.
        try: pickdat = rje.dataDict(self, resfile, [self.getStr('PickHead')])
        except: pickdat = {}
        picked.extend(pickdat.keys())
    return picked
def readPELM(self):  ### Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
    '''
    Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
    Populates self.dict['PhosphoSites'] = {acc:{pos:{'aa':code}}}, builds a SeqList of
    one sequence per accession (self.obj['SeqList']) and stores the UniProt object
    in self.obj['UniProt']. Saves sequences to self.info['PELMFas'] for BLASTing.
    '''
    try:
        ### ~ [1] Setup & Read File into Data Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        data = rje.dataDict(self, self.info['PELM'], mainkeys=['acc', 'position'])
        seqdict = {}    # Dictionary of Acc:Sequence
        ### ~ [2] Generate PhosphoSites dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pdict = self.dict['PhosphoSites']
        for dkey in data:
            ## ~ [2a] Basic acc, seq and pos ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # dkey is the whitespace-joined (acc, position) main key.
            (acc, pos) = string.split(dkey)
            pos = string.atoi(pos)
            if acc not in pdict: pdict[acc] = {}
            if pos not in pdict[acc]: pdict[acc][pos] = {}
            ## ~ [2b] PhosphoELM data with checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Cross-check that every row for an accession carries the same sequence,
            # and that the reported residue matches both earlier rows and the sequence.
            if acc not in seqdict: seqdict[acc] = data[dkey]['sequence']
            elif seqdict[acc] != data[dkey]['sequence']: self.log.printLog('#ERR', 'Warning. Sequence mismatch for %s' % acc)
            if 'aa' not in pdict[acc][pos]: pdict[acc][pos]['aa'] = data[dkey]['code']
            elif pdict[acc][pos]['aa'] != data[dkey]['code']: self.log.printLog('#ERR', 'Warning. PhosphoSite mismatch for %s at pos %d: %s not %s' % (acc, pos, data[dkey]['code'], pdict[acc][pos]['aa']))
            if data[dkey]['code'] != seqdict[acc][(pos - 1):pos]: self.log.printLog('#ERR', 'Warning. PhosphoSeq mismatch for %s at pos %d: %s not %s' % (acc, pos, data[dkey]['code'], seqdict[acc][pos - 1:pos]))
        ### ~ [3] Make sequence objects and update PhosphoSites keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [3a] Setup objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        acclist = rje.sortKeys(seqdict)
        pelmuni = rje_uniprot.UniProt(self.log, self.cmd_list)   # UniProt entry
        unidict = pelmuni.accDict(acclist)                       # Dictionary of {acc:UniProtEntry}
        pelmseq = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=None'])  # SeqList object
        ## ~ [3b] Add one sequence for each AccNum and update seqdict ~~~~~~~~~~~~~~~~~~~~~~~~ ##
        #!# Look out for splice variants! (There are some!) - Copy UniProt and change sequence & AccNum #!#
        for acc in acclist:     #!# Make accdict of {acc:Seq} using unidict and seqlist #!#
            sequence = seqdict[acc]
            try:
                # Strip any splice-variant suffix ('-n') when looking up UniProt.
                uni = unidict[string.split(acc, '-')[0]]
                desc = uni.obj['Sequence'].info['Description']
                name = '%s__%s %s' % (uni.obj['Sequence'].info['ID'], acc, desc)
                if sequence != uni.obj['Sequence'].info['Sequence']: self.log.printLog('#WARNING', 'Sequence mismatch for UniProt entry %s' % acc)
            except:
                # Fall back to a synthetic name when UniProt lookup fails.
                self.log.errorLog('Problem with %s' % acc)
                name = '%s_UNK__%s' % (acc, acc)    #!# Add sequences where UniProt missing #!#
            # seqdict now maps acc -> Seq object (was acc -> raw sequence string).
            seqdict[acc] = pelmseq._addSeq(name, sequence)
        ## ~ [3c] Filtering of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.opt['FilterSeq']:
            pelmseq.autoFilter()
            for acc in acclist:
                if seqdict[acc] not in pelmseq.seq: seqdict.pop(acc)
            acclist = rje.sortKeys(seqdict)
        ## ~ [3d] Save sequences for BLASTing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Only prompt to overwrite in interactive mode.
        if not os.path.exists(self.info['PELMFas']) or self.stat['Interactive'] < 0 or rje.yesNo('%s exists: overwrite?' % self.info['PELMFas']): pelmseq.saveFasta(seqfile=self.info['PELMFas'])
        self.obj['SeqList'] = pelmseq
        self.obj['UniProt'] = pelmuni
    except: self.log.errorLog('Problem during PhosphoSeq.readPELM')
def setup(self):  ### Sets up headers and reads in existing data if present
    '''
    Sets up headers and reads in existing data if present.
    Reads HGNC data, then merges each AltSource file (later sources overwrite
    earlier ones) into self.dict['GeneCard'], upper-casing gene keys and
    cross-linking entries under their Symbol. Sets self.list['Headers'].
    Raises on failure (after logging).
    '''
    try:
        ### ~ Setup Basic Headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #X#headers = ['Alias','Species','Symbol','HGNC','Entrez','UniProt','EnsEMBL','HPRD','OMIM','EnsLoci','Desc']
        headers = ['Alias', 'Species'] + gc_headers     # All other headers added from altsource list
        ### ~ Read in data from existing files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.readHGNC()
        # In update mode, previous output is re-read as the last (highest priority) source.
        if self.opt['Update'] and os.path.exists(self.info['CardOut']): self.list['AltSource'].append(self.info['CardOut'])
        for altsource in self.list['AltSource']:
            sourcefile = rje.makePath(altsource, True)
            if not os.path.exists(sourcefile):
                self.log.errorLog('Alternative source "%s" missing!' % sourcefile, printerror=False, quitchoice=True)
                continue
            update = rje.dataDict(self, sourcefile, getheaders=True, ignore=['#'])
            # Accumulate any new column headers seen in this source.
            for h in update.pop('Headers'):
                if h not in headers: headers.append(h)
            self.log.printLog('#DATA', 'Read GeneCards data for %d genes.' % (len(update)))
            for gene in rje.sortKeys(update):   # Each source will overwrite data from the file before
                ## ~ Convert to Upper Case for consistency ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if gene != gene.upper() and gene.upper() in update: continue    # Only use upper case one!
                elif gene != gene.upper():
                    update[gene.upper()] = update.pop(gene)
                    gene = gene.upper()
                if gene == '!FAILED!': continue     # Sentinel key for failed lookups
                ## ~ Update main dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if self.opt['Update'] and altsource == self.info['CardOut'] and gene not in self.list['Genes']: self.list['Genes'].append(gene)
                if gene in self.dict['GeneCard']: rje.combineDict(self.dict['GeneCard'][gene], update[gene])
                else: self.dict['GeneCard'][gene] = update[gene]
                ## ~ Temp Debugging ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if gene in self.list['TestGenes']:
                    print gene
                    print update[gene]
                    self.deBug(self.dict['GeneCard'][gene])
                ## ~ Check Aliases etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'Symbol' in self.dict['GeneCard'][gene]: self.dict['GeneCard'][gene]['Symbol'] = self.dict['GeneCard'][gene]['Symbol'].upper()
                # Cross-link the entry under its official Symbol (without overwriting existing data).
                if 'Symbol' in update[gene] and update[gene]['Symbol'] != '!FAILED!':
                    symbol = update[gene]['Symbol']
                    if symbol in self.dict['GeneCard']: rje.combineDict(self.dict['GeneCard'][symbol], update[gene], overwrite=False, replaceblanks=True)
                    else: self.dict['GeneCard'][symbol] = update[gene]
                self.log.printLog('\r#CARD', 'Extracted GeneCards data for %d genes.' % (len(self.dict['GeneCard'])), newline=False, log=False)
                # Flag gene keys containing whitespace - these indicate parsing problems.
                if len(string.split(gene)) > 1: print '!!!', gene, '!!!'
        ### ~ Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.log.printLog('\r#CARD', 'Extracted GeneCards data for %d genes.' % (len(self.dict['GeneCard'])))
        self.list['Headers'] = headers[0:]
        # In update mode the full table is rewritten, so do not append.
        if self.opt['Update']: self.opt['Append'] = False
        #x#if 'TASP1' in self.dict['GeneCard']: self.deBug(self.dict['GeneCard']['TASP1'])
        #x#else: self.deBug(rje.sortKeys(self.dict['GeneCard']))
    except:
        self.log.errorLog('Problem during GeneCards.setup()')
        raise
def setup(self, gtext=''):  ### Main class setup method. gtext will over-ride input file.
    '''
    Main class setup method.
    >> gtext:str [''] = Raw glossary text; if given, over-rides the input file.
    Builds self.dict['Glossary'] as a nested word-tree with '=' marking the
    definition, and fills/filters self.list['Terms'].
    << True if setup successful, else False.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['HTML'] = rje_html.HTML(self.log, self.cmd_list)
        ## ~ [1a] File names etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.basefile().lower() in ['', 'none']: self.basefile(rje.baseFile(self.getStr('InFile')))
        if self.getStr('OutFile').lower() in ['', 'none']: self.str['OutFile'] = '%s.html' % self.basefile()
        ## ~ [1b] Read in Glossary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        interms = []    # Terms in input order (used when keeporder=T)
        if gtext:
            # Map the TermSplit setting onto the actual delimiter character.
            delimit = self.getStr('TermSplit')
            if delimit.lower() == 'tab': delimit = '\t'
            if delimit.lower() == 'space': delimit = ' '
            if delimit.lower() == 'comma': delimit = ','
            if delimit.lower() == 'period (.)': delimit = '.'
            if delimit.lower() == 'colon': delimit = ':'
            glossary = {}
            for line in string.split(gtext, '\n'):
                splitline = string.split(line, delimit)
                # A trailing period leaves an empty final element - drop it.
                if delimit == '.' and (splitline[-1] in ['', ' ']): splitline = splitline[:-1]
                if not splitline: continue
                (term, definition) = (splitline[0], string.join(splitline[1:], delimit))
                if term == 'Term' and not glossary: continue    # Header row
                if term:
                    glossary[term] = {'Definition': definition}
                    interms.append(term)
        else:
            try:
                if not self.getBool('KeepOrder') and open(self.getStr('InFile'), 'r').readline()[:4] == 'Term':
                    glossary = rje.dataDict(self, self.getStr('InFile'), mainkeys=['Term'], datakeys=['Term', 'Definition'])
                else: return self.setup(open(self.getStr('InFile'), 'r').read())
            except:
                self.errorLog('Problem reading input as dataDict(). Will try as text.')
                return self.setup(open(self.getStr('InFile'), 'r').read())
        if self.list['Terms']:
            # Bug fix: the original popped entries from `glossary` while iterating the
            # dict itself, which raises RuntimeError; iterate over a snapshot of the keys.
            for term in list(glossary.keys()):
                if term not in self.list['Terms']: glossary.pop(term)
        elif self.getBool('KeepOrder'): self.list['Terms'] = interms
        else: self.list['Terms'] = rje.sortKeys(glossary)
        # Flatten {'Definition':X} entries to plain definition strings.
        for term in glossary: glossary[term] = glossary[term]['Definition']
        ### ~ [2] Create Full Glossary Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        nested = {}
        for term in glossary:
            tdict = nested
            for word in string.split(term.lower()):
                if word not in tdict: tdict[word] = {}
                tdict = tdict[word]
            tdict['='] = glossary[term]     # '=' key marks the definition at this node
        self.dict['Glossary'] = nested
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False     # Setup failed
def ppiDisMatrix(self):  ### Converts PPI Table into distance matrix
    '''
    Converts PPI Table into distance matrix and saves it to file.
    Distance between two proteins = number of interactors unique to one of them
    (scaled by their combined interactor count if scaled=T).
    << False on failure; None on success (unchanged from original contract).
    '''
    try:
        ### Check File ###
        if not os.path.exists(self.info['PPITab']):
            self.log.errorLog('PPI Table file "%s" missing!' % self.info['PPITab'], printerror=False)
            return False
        ### Setup ###
        data = rje.dataDict(self, self.info['PPITab'], getheaders=True)
        headers = data.pop('Headers')
        ppidis = rje_dismatrix.DisMatrix(self.log, self.cmd_list)
        ppidis.opt['Symmetric'] = True
        ppidis.setInfo({'Name': '%s.ppi_dis.txt' % self.info['Basefile'], 'Type': 'PPI'})
        ### Make DisMatrix ###
        for p1 in headers[1:]:
            ppidis.addDis(p1, p1, 0)
            for p2 in headers[headers.index(p1) + 1:]:
                ppi = 0     # Interactors of either protein
                unique = 0  # Interactors of exactly one protein
                for i in data.keys():
                    # Non-numeric cells are kept as strings (truthy if non-empty).
                    try: v1 = int(data[i][p1])
                    except: v1 = data[i][p1]
                    try: v2 = int(data[i][p2])
                    except: v2 = data[i][p2]
                    if v1 or v2:
                        ppi += 1
                        if not (v1 and v2): unique += 1
                if self.opt['Scaled']:
                    # Bug fix: guard ZeroDivisionError when neither protein has any
                    # interactions (ppi == 0); previously this aborted the whole
                    # matrix via the blanket except. Treat as zero distance.
                    if ppi: ppidis.addDis(p1, p2, float(unique) / float(ppi))
                    else: ppidis.addDis(p1, p2, 0.0)
                else: ppidis.addDis(p1, p2, unique)
        ### Output ###
        delimit = rje.getDelimit(self.cmd_list, default=',')
        ppidis.saveMatrix(headers[1:], ppidis.info['Name'], delimit)
    except:
        self.log.errorLog('Major problem with rje_ppi.ppiDisMatrix')
        return False
def run(self):  ### Main run method
    '''
    Main run method. Loads occurrence data (if given), calculates per-residue
    plotting statistics (surface accessibility, hydropathy, disorder,
    conservation) for each sequence, then writes one plot file per
    sequence/dataset via self.output().
    '''
    try:
        ### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Basefile becomes a filename prefix; ensure it ends '.' when set.
        if self.info['Basefile'].lower() in ['', 'none']: self.info['Basefile'] = ''
        elif self.info['Basefile'][-1] != '.': self.info['Basefile'] += '.'
        self.obj['SeqList'] = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=T'])
        self.list['PlotFT'] = string.split(string.join(self.list['PlotFT']).upper())
        if self.info['OccFile'].lower() not in ['', 'none']:
            self.info['Delimit'] = rje.delimitFromExt(filename=self.info['OccFile'])
            self.dict['OccData'] = {}
            occdata = rje.dataDict(self, self.info['OccFile'], ['Seq', 'Dataset', 'Pattern', 'Start_Pos', 'End_Pos'], ['Seq', 'Dataset', 'Pattern', 'Start_Pos', 'End_Pos'])
            # Reorganise flat occurrence table into {seq:{dataset:[occ dicts]}}.
            for key in rje.sortKeys(occdata):
                seq = occdata[key].pop('Seq')
                if seq not in self.dict['OccData']: self.dict['OccData'][seq] = {}
                dataset = occdata[key].pop('Dataset')
                if dataset not in self.dict['OccData'][seq]: self.dict['OccData'][seq][dataset] = []
                self.dict['OccData'][seq][dataset].append(occdata[key])
            self.printLog('#OCC', 'Loaded data for %s occurrences in %s sequences' % (rje.integerString(len(occdata)), rje.integerString(len(self.dict['OccData']))))
            # Restrict the SeqList to sequences that have occurrence data.
            self.obj['SeqList'].autoFilter(['GoodSeq=%s' % string.join(rje.sortKeys(self.dict['OccData']), ',')])
        ### ~ [2] Calculate Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.list['PlotStat'] = string.split(string.join(self.list['PlotStat']).lower())
        # slimcalc only exists when conservation stats are requested (guarded below).
        if 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']: slimcalc = rje_slimcalc.SLiMCalc(self.log, self.cmd_list)
        seqdict = self.obj['SeqList'].seqNameDic()
        for name in rje.sortKeys(seqdict):
            if self.opt['OccOnly'] and not name in self.dict['OccData']: continue
            seq = seqdict[name]
            sequence = seq.getSequence(gaps=False)
            seq.dict['PlotStat'] = {}
            if 'sa' in self.list['PlotStat']: seq.dict['PlotStat']['SA'] = rje_seq.surfaceAccessibility(sequence, returnlist=True)
            if 'hyd' in self.list['PlotStat']: seq.dict['PlotStat']['Hydropathy'] = rje_seq.eisenbergHydropathy(sequence, returnlist=True)
            if 'dis' in self.list['PlotStat']: seq.dict['PlotStat']['Disorder'] = seq.disorder(returnlist=True)
            if 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']:
                slimcalc.relConListFromSeq(seq, slimcalc.stat['RelConWin'], store=True)
                try:
                    seq.dict['PlotStat']['Cons_Abs'] = seq.list.pop('Cons')
                    seq.dict['PlotStat']['Cons_Rel'] = seq.list.pop('RelCons')
                except: self.printLog('#CONS', 'No conservation stats for %s' % name)
            self.printLog('#STAT', 'PlotStats calculated for %s' % name)
            for stat in seq.dict['PlotStat']:
                # Smooth with a sliding window (Cons_Rel is already windowed).
                if stat != 'Cons_Rel' and self.stat['PlotWin'] >= 0: seq.dict['PlotStat'][stat] = self.plotWin(seq.dict['PlotStat'][stat])
                seq.dict['PlotStat'][stat] = self.convertStat(seq.dict['PlotStat'][stat])
            self.printLog('#STAT', 'PlotStats converted for %s' % name)
            ### ~ [3] Output Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # One output file per dataset when occurrence data exists; else one per sequence.
            if name in self.dict['OccData']:
                for dataset in self.dict['OccData'][name]:
                    ofile = '%s%s.%s.plot.txt' % (self.info['Basefile'], dataset, seq.info['AccNum'])
                    self.output(seq, ofile, self.dict['OccData'][name][dataset])
            else: self.output(seq, '%s%s.plot.txt' % (self.info['Basefile'], seq.info['AccNum']))
        return
    except: self.errorLog(rje_zen.Zen().wisdom())
def pickupList(self):  ### Generates Pickup List from file(s) #V1.0
    '''
    Generates Pickup List from existing result file(s), one per OutList entry.
    << list of previously-processed keys (empty if PickHead not set).
    '''
    if not self.getStrLC('PickHead').lower(): return []
    picked = []
    for out in self.list['OutList']:
        resfile = '%s.%s' % (self.baseFile(), out)
        # Missing/unreadable result files simply contribute nothing.
        try: pickdat = rje.dataDict(self, resfile, [self.getStr('PickHead')])
        except: pickdat = {}
        picked.extend(pickdat.keys())
    return picked
def ppiDisMatrix(self):  ### Converts PPI Table into distance matrix
    '''
    Converts PPI Table into distance matrix and saves it to file.
    Distance between two proteins = number of interactors unique to one of them
    (scaled by their combined interactor count if scaled=T).
    << False on failure; None on success (unchanged from original contract).
    '''
    try:
        ### Check File ###
        if not os.path.exists(self.info['PPITab']):
            self.log.errorLog('PPI Table file "%s" missing!' % self.info['PPITab'], printerror=False)
            return False
        ### Setup ###
        data = rje.dataDict(self, self.info['PPITab'], getheaders=True)
        headers = data.pop('Headers')
        ppidis = rje_dismatrix.DisMatrix(self.log, self.cmd_list)
        ppidis.opt['Symmetric'] = True
        ppidis.setInfo({'Name': '%s.ppi_dis.txt' % self.info['Basefile'], 'Type': 'PPI'})
        ### Make DisMatrix ###
        for p1 in headers[1:]:
            ppidis.addDis(p1, p1, 0)
            for p2 in headers[headers.index(p1) + 1:]:
                ppi = 0     # Interactors of either protein
                unique = 0  # Interactors of exactly one protein
                for i in data.keys():
                    # Non-numeric cells are kept as strings (truthy if non-empty).
                    try: v1 = int(data[i][p1])
                    except: v1 = data[i][p1]
                    try: v2 = int(data[i][p2])
                    except: v2 = data[i][p2]
                    if v1 or v2:
                        ppi += 1
                        if not (v1 and v2): unique += 1
                if self.opt['Scaled']:
                    # Bug fix: guard ZeroDivisionError when neither protein has any
                    # interactions (ppi == 0); previously this aborted the whole
                    # matrix via the blanket except. Treat as zero distance.
                    if ppi: ppidis.addDis(p1, p2, float(unique) / float(ppi))
                    else: ppidis.addDis(p1, p2, 0.0)
                else: ppidis.addDis(p1, p2, unique)
        ### Output ###
        delimit = rje.getDelimit(self.cmd_list, default=',')
        ppidis.saveMatrix(headers[1:], ppidis.info['Name'], delimit)
    except:
        self.log.errorLog('Major problem with rje_ppi.ppiDisMatrix')
        return False
def readSLiMSearch(self):  ### Reads SLiMSearch results into data dictionary
    '''
    Reads SLiMSearch results into data dictionary: filters the summary file by
    MinOcc, then delegates occurrence parsing to self.readSLiMSearchOcc().
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sumfile = '%s.summary.csv' % self.info['ResFile']
        occfile = '%s.csv' % self.info['ResFile']
        if not os.path.exists(sumfile): return self.errorLog('No Summary file "%s"!' % sumfile, printerror=False)
        if not os.path.exists(occfile): return self.errorLog('No Occurrence file "%s"!' % occfile, printerror=False)
        ### ~ [2] Read Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        esum = rje.dataDict(self, sumfile, mainkeys=['Motif'], datakeys='All', getheaders=False)
        occmotifs = []      # Motifs with enough occurrences to process
        for slim in rje.sortKeys(esum):
            if string.atoi(esum[slim]['N_Occ']) >= self.stat['MinOcc']: occmotifs.append(slim)
        ### ~ [3] Read Occurrences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#MOTIF', '%d motifs with N_Occ >= MinOcc (%d)' % (len(occmotifs), self.stat['MinOcc']))
        self.readSLiMSearchOcc(occmotifs)
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def setup(self):  ### Main class setup method.
    '''
    Main class setup method. Loads the full pairwise PPI table, filters out
    likely complex-derived (indirect) interactions, and loads the EnsLoci
    sequence set. Populates self.dict['PPI'], self.dict['SeqObj'],
    self.dict['Gene2Seq'] and self.dict['Seq2Gene'].
    << True if setup successful, else False.
    '''
    try:
        ### ~ [1] Pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): hard-coded absolute data paths throughout this method
        # tie it to one specific filesystem - consider making them configurable.
        ppipairwise = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/pingu.pairwise.tdt'
        self.progLog('\r#PPI', 'Loading pairwise data...')
        pairwise = rje.dataDict(self, ppipairwise, ['Hub', 'Spoke'], ['Spoke', 'SpokeSeq', 'Evidence'])
        gene2seq = {}; seq2gene = {}    # Gene <-> EnsLoci sequence name mappings
        fullppi = {}; px = 0.0; ptot = len(pairwise); ppix = 0
        for pair in rje.sortKeys(pairwise):     # sortKeys returns a list, so popping below is safe
            self.progLog('\r#PPI', 'Processing full pairwise PPI: %.2f%%' % (px / ptot)); px += 100.0
            [hub, spoke] = string.split(pair, '\t')
            if spoke not in gene2seq:
                sseq = pairwise[pair]['SpokeSeq']
                gene2seq[spoke] = sseq; seq2gene[string.split(sseq, '__')[0]] = spoke
            if hub not in fullppi: fullppi[hub] = {}
            # Pop consumes pairwise as we go to limit memory use.
            if spoke not in fullppi[hub]: fullppi[hub][spoke] = pairwise.pop(pair)['Evidence']; ppix += 1
        self.printLog('\r#PPI', 'Processed full pairwise PPI: %s genes; %s ppi.' % (rje.integerString(len(fullppi)), rje.integerString(ppix / 2)))
        ### ~ [2] Filter complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        goodppifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/hybrid.txt'
        goodppi = self.loadFromFile(goodppifile, chomplines=True)   # Evidence types accepted as direct
        self.dict['PPI'] = {}
        px = 0.0; ptot = len(fullppi); fppix = ppix; ppix = 0
        for hub in fullppi:
            self.progLog('\r#PPI', 'Filtering complexes: %.2f%% (%s hubs; %s ppi)' % (px / ptot, rje.integerString(len(self.dict['PPI'])), rje.integerString(ppix))); px += 100.0
            self.dict['PPI'][hub] = []
            for spoke in fullppi[hub]:
                # Keep the pair if any evidence type is in the "good" (direct) list...
                goodspoke = False
                for ptype in goodppi:
                    if rje.matchExp(':(%s)($|\|)' % ptype, fullppi[hub][spoke]): goodspoke = True; break
                if goodspoke: self.dict['PPI'][hub].append(spoke); continue
                # ...otherwise keep it only if hub and spoke share no third partner
                # (a shared partner suggests co-membership of a complex, not direct PPI).
                # NOTE(review): assumes spoke is also a hub key in fullppi; a KeyError
                # here would be swallowed by the blanket except - TODO confirm.
                goodspoke = True
                for spoke2 in fullppi[hub]:
                    if spoke2 in [hub, spoke]: continue
                    if spoke2 in fullppi[spoke]: goodspoke = False; break
                if goodspoke: self.dict['PPI'][hub].append(spoke)
            ppix += len(self.dict['PPI'][hub])
            if not self.dict['PPI'][hub]: self.dict['PPI'].pop(hub)
        self.printLog('\r#PPI', 'Filtered complexes: (%s -> %s hubs; %s -> %s ppi)' % (rje.integerString(len(fullppi)), rje.integerString(len(self.dict['PPI'])), rje.integerString(fppix / 2), rje.integerString(ppix / 2)))
        ### ~ [3] SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/EnsEMBL/ens_HUMAN.loci.fas'
        scmd = ['accnr=F', 'seqnr=F', 'seqin=%s' % seqfile] + self.cmd_list + ['autoload=T']
        seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log, scmd)
        self.dict['SeqObj'] = seqlist.seqNameDic('Max')
        self.dict['Gene2Seq'] = gene2seq; self.dict['Seq2Gene'] = seq2gene
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False     # Setup failed
def loadAlias(self, sourcefile):  ### Loads Alias data
    '''
    Loads Alias data into self.dict['Aliases'] via self.addAlias().
    >> sourcefile:str = Source filename ('' or 'none' to skip).
    '''
    try:
        ### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if sourcefile.lower() in ['', 'none']: return
        if not os.path.exists(sourcefile):
            return self.log.errorLog('Alias file "%s" not found' % (sourcefile), printerror=False)
        data = rje.dataDict(self, sourcefile, datakeys=['Aliases'], lists=True)
        ### ~ [2] Parse out Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (done, total) = (0.0, len(data))
        for key in data:
            self.log.printLog('\r#ALIAS', 'Processing %s: %.1f%%' % (sourcefile, done / total), newline=False, log=False)
            done += 100.0
            ## ~ [2a] Update self.dict ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Each 'Aliases' value is a comma-separated list of alias names.
            for aliaslist in data[key]['Aliases']:
                for alias in string.split(aliaslist, ','): self.addAlias(key, alias)
            if key in self.dict['Aliases']: self.dict['Aliases'][key].sort()
        self.log.printLog('\r#ALIAS', 'Processed %s: %s IDs with aliases' % (sourcefile, rje.integerString(len(self.dict['Aliases']))))
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def mapEnsGO(self, spec='HUMAN', gokey='EnsGO', fixhead=True):  ### Extracts EnsEMBL GO mapping data from a BioMart download
    '''
    Extracts EnsEMBL GO mapping data from a BioMart download.
    >> spec:str ['HUMAN'] = Species code used in the ens_SPEC.TYPE.tdt file names.
    >> gokey:str ['EnsGO'] = Key of self.dict to store gene-GO mappings under.
    >> fixhead:bool [True] = Whether to detect the gene/GO columns from the header row.
    << False if no mapping file found, else None.
    '''
    ### ~ [1] ~ Setup paths and files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    if gokey not in self.dict: self.dict[gokey] = {}
    ensmap = []     # Existing BioMart download files to parse
    for gtype in ['GO', 'GO.BP', 'GO.CC', 'GO.MF']:
        gfile = self.info['EnsGOPath'] + 'ens_%s.%s.tdt' % (spec, gtype)
        if os.path.exists(gfile): ensmap.append(gfile)
    if not ensmap:
        self.errorLog('EnsEMBL-GO mapping file (%s) missing' % self.info['EnsGOPath'], printerror=False)
        return False
    ### ~ [2] ~ Parse Gene-GO Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    mainkeys = ['Ensembl Gene ID', 'GO ID']
    for gfile in ensmap:
        if fixhead:
            # Detect the gene column and the GO accession column from the header row.
            headers = string.split(rje.chomp(open(gfile, 'r').readlines()[0]), '\t')
            if 'Ensembl Gene ID' in headers: mainkeys = ['Ensembl Gene ID']
            else: mainkeys = headers[:1]
            for gohead in ['GO Term Accession', 'GO Term Accession (bp)', 'GO Term Accession (mf)', 'GO Term Accession (cc)', 'GO ID']:
                if gohead in headers:
                    mainkeys.append(gohead)
                    break
            else: mainkeys.append(headers[2])   # Fall back on the third column
            self.printLog('#HEAD', '%s' % (string.join(mainkeys, ' / ')))
        self.progLog('\r#GO', 'Mapping EnsEMBL GO...')
        ensdata = rje.dataDict(self, gfile, mainkeys)
        (done, total) = (0.0, len(ensdata))
        obselete_go = []    # GO IDs seen but absent from both self.go() and AltID
        for entry in ensdata:
            self.progLog('\r#GO', 'Mapping EnsEMBL GO: %.2f%%' % (done / total)); done += 100.0
            try: (gene, go) = string.split(entry)
            except: continue    # no GO!
            ## Update dictionaries ##
            if go[:3] == 'GO:': go = go[3:]     # Strip the 'GO:' prefix
            if go in self.go(): self.addGeneGO(gene, go, gokey)
            elif go in self.dict['AltID']:
                # Map superseded IDs onto their current equivalents.
                for altid in self.dict['AltID'][go]: self.addGeneGO(gene, altid, gokey)
            elif go not in obselete_go: obselete_go.append(go)
        self.printLog('\r#GO', 'Mapping EnsEMBL GO from %s complete.' % os.path.basename(gfile))
def loadMutations(self):  ### Inputs parsed mutations back into dictionaries
    '''
    Inputs parsed mutations back into dictionaries:
    self.dict['Records'] = {gene:[OMIM IDs]}; self.dict['Mutations'] = {gene:{subid:(disease,mutation)}}.
    << True if the input file was read, False otherwise.
    '''
    try:
        ### ~ [1] Setup input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.dict['Records'] = {}
        self.dict['Mutations'] = {}
        fields = ['OMIM_ID', 'SubID', 'Gene', 'Pos', 'WildAA', 'MutAA', 'Disease']
        infile = 'omim_mutations.tdt'
        if not os.path.exists(infile): return False
        datadict = rje.dataDict(self, infile, fields[:2], fields, '\t')
        mx = len(datadict)
        ### ~ [2] Process into dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Consume the dictionary entry by entry to keep memory down.
        for dkey in list(datadict.keys()):
            entry = datadict.pop(dkey)
            (record, subid, gene) = (entry['OMIM_ID'], entry['SubID'], entry['Gene'])
            mutation = '%s%s%s' % (entry['WildAA'], entry['Pos'], entry['MutAA'])
            disease = entry['Disease']
            self.dict['Records'].setdefault(gene, [])
            if record not in self.dict['Records'][gene]: self.dict['Records'][gene].append(record)
            self.dict['Mutations'].setdefault(gene, {})[subid] = (disease, mutation)
        self.log.printLog('\r#OMIM', 'Loaded %s OMIM mutations (%s genes).' % (rje.integerString(mx), rje.integerString(len(self.dict['Records']))))
        return True
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
        return False
def run(self):  ### Main Run Method
    '''
    Main Run Method.
    [1] Loads (or re-parses) OMIM mutation data.
    [2] Maps OMIM genes onto EnsLoci sequences via Pingu/GeneCards and computes a
        per-sequence position "fudge factor" reconciling OMIM and EnsLoci numbering.
    [3] Cross-references mutation positions against SLiMFinder occurrence files,
        outputting overlaps to rje_omim.slimfinder.tdt.
    [4] Estimates significance of the observed mutation/SLiM overlap (binomial).
    '''
    try:### ~ [1] Parse/Read Mutation data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['Force'] or not self.loadMutations(): self.parseOMIM()
        ### ~ [2] Additional Pingu incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #!# Load PPI data using Pingu, map genes to sequences and check mutation residues #!#
        ## ~ [2a] Setup Pingu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        import pingu
        pcmd = self.cmd_list + ['fulloutput=F']
        ping = self.obj['Pingu'] = pingu.PINGU(self.log,pcmd)
        ping.run()
        ## ~ [2b] Read in EnsLoci sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not ping.obj['GeneCards']: return self.log.errorLog('Cannot map EnsLoci without GeneCards.',printerror=False)
        genecards = ping.obj['GeneCards'].dict['GeneCard']  # GeneCards dictionary
        ensloci = ping.getEnsLoci()                         # EnsLoci SeqList object (ping.obj['EnsLoci'])
        seqdict = ensloci.seqNameDic()
        if not seqdict: return self.log.errorLog('Failed to read in EnsLoci sequences.',printerror=False)
        ## ~ [2c] Calculate fudge factor for each gene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.dict['Fudge'] = {}     # {Sequence object: position offset}
        ensback = {}                # Dictionary of {EnsLoci name:OMIM gene}
        mutations = {}              # Reorganised dictionary of {gene:{pos:Mutation}}
        for gene in rje.sortKeys(self.dict['Mutations']):
            try: seq = seqdict[genecards[gene]['EnsLoci']]
            except:
                self.log.printLog('#MAP','No EnsLoci protein mapped for %s' % gene)
                continue
            mutations[gene] = {}
            ensback[genecards[gene]['EnsLoci']] = gene
            mutpos = {}     # Dictionary of {pos:AA} to map onto sequence
            for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                (disease,mutation) = self.dict['Mutations'][gene][subid]
                # Mutations are stored as e.g. ALA123VAL: 3-letter wild AA, position, 3-letter mutant AA.
                (wild,pos,mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',mutation)
                mutpos[int(pos)] = rje_sequence.aa_3to1[wild.upper()]
                mutations[gene][int(pos)] = self.dict['Mutations'][gene][subid]
            self.dict['Fudge'][seq] = seq.fudgeFactor(mutpos)
        self.deBug(self.dict['Fudge'])
        ### ~ [3] Cross-reference to SLiMFinder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        allslims = {}   # Full dictionary of SLiMFinder results matching OMIM genes
        slimomim = []   # List of (gene,pos) overlapping with SLiMs
        outfile = 'rje_omim.slimfinder.tdt'
        dataheaders = string.split('Dataset,Rank,Pattern,Hit,Pos,EndPos,SeqLen,Variant,Match,AbsChg,NetChg,PepSeq,PepDesign',',')
        headers = ['Gene','OMIM','SubID','Mutation','Disease'] + dataheaders
        rje.delimitedFileOutput(self,outfile,headers,delimit='\t',rje_backup=True)
        for file in glob.glob(self.info['SlimDir'] + '*.occ.csv'):      # Potential SLiM
            slimdata = rje.dataDict(self,file,['Pattern','Hit','Pos','Match'],dataheaders,delimit=',')
            for occ in slimdata:
                if slimdata[occ]['Hit'] in ensback:     # OMIM gene - possible overlap
                    gene = ensback[slimdata[occ]['Hit']]
                    (start,end) = (int(slimdata[occ]['Pos']),int(slimdata[occ]['EndPos']))
                    if gene not in allslims: allslims[gene] = {}
                    allslims[gene][occ] = slimdata[occ]
                    for mpos in mutations[gene]:
                        # Fudge factor corrects any offset between OMIM and EnsLoci numbering.
                        if start <= (mpos + self.dict['Fudge'][seqdict[genecards[gene]['EnsLoci']]]) <= end:
                            self.log.printLog('#OMIMSLIM','%s %s %s (%d-%d) = %s' % (slimdata[occ]['Dataset'],slimdata[occ]['Hit'],slimdata[occ]['Pattern'],start,end,mutations[gene][mpos]))
                            slimdata[occ]['Gene'] = gene
                            slimdata[occ]['OMIM'] = string.join(self.dict['Records'][gene])
                            slimdata[occ]['Mutation'] = mutations[gene][mpos][1]
                            slimdata[occ]['Disease'] = mutations[gene][mpos][0]
                            rje.delimitedFileOutput(self,outfile,headers,'\t',slimdata[occ])
                            if (gene,mpos) not in slimomim: slimomim.append((gene,mpos))
        ### ~ [4] Calculate coverage of SLiMs for "significance" assessment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (inslim,resx,mutx) = (0,0,0)    # No. of residues in SLiMs, total residue count + no. mutations that may overlap
        for gene in mutations:      # These are just the genes that mapped to sequences
            mutx += len(mutations[gene])
            resx += seqdict[genecards[gene]['EnsLoci']].aaLen()
            if gene in allslims:    # Partially covered by SLiMs
                res = [0] * seqdict[genecards[gene]['EnsLoci']].aaLen()
                for occ in allslims[gene]:
                    (start,end) = (int(allslims[gene][occ]['Pos'])-1,int(allslims[gene][occ]['EndPos']))
                    # Mark 0-based residues start..end-1 as covered. BUGFIX: was res[end-1:],
                    # which duplicated residue end-1 and grew res by one element per occurrence,
                    # inflating the coverage estimate (and the list length).
                    res = res[:start] + [1] * (end-start) + res[end:]
                self.deBug('%s %d (%d)' % (gene,sum(res),seqdict[genecards[gene]['EnsLoci']].aaLen()))
                inslim += sum(res)
        self.log.printLog('#COV','SLiMs have %.1f%% coverage of OMIM gene sequences' % (100.0*inslim/resx))
        self.log.printLog('#MUT','%d mutations that could potentially occur in SLiMs' % mutx)
        self.log.printLog('#PROB','Probability of observed %d mutation overlap = %.4f' % (len(slimomim),rje.binomial(len(slimomim),mutx,float(inslim)/resx,callobj=self)))
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def run(self):  ### Main Run Method
    '''
    Main Run Method.
    [1] Loads (or re-parses) OMIM mutation data.
    [2] Maps OMIM genes onto EnsLoci sequences via Pingu/GeneCards and computes a
        per-sequence position "fudge factor" reconciling OMIM and EnsLoci numbering.
    [3] Cross-references mutation positions against SLiMFinder occurrence files,
        outputting overlaps to rje_omim.slimfinder.tdt.
    [4] Estimates significance of the observed mutation/SLiM overlap (binomial).
    '''
    try:### ~ [1] Parse/Read Mutation data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['Force'] or not self.loadMutations(): self.parseOMIM()
        ### ~ [2] Additional Pingu incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #!# Load PPI data using Pingu, map genes to sequences and check mutation residues #!#
        ## ~ [2a] Setup Pingu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        import pingu
        pcmd = self.cmd_list + ['fulloutput=F']
        ping = self.obj['Pingu'] = pingu.PINGU(self.log,pcmd)
        ping.run()
        ## ~ [2b] Read in EnsLoci sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not ping.obj['GeneCards']: return self.log.errorLog('Cannot map EnsLoci without GeneCards.',printerror=False)
        genecards = ping.obj['GeneCards'].dict['GeneCard']  # GeneCards dictionary
        ensloci = ping.getEnsLoci()                         # EnsLoci SeqList object (ping.obj['EnsLoci'])
        seqdict = ensloci.seqNameDic()
        if not seqdict: return self.log.errorLog('Failed to read in EnsLoci sequences.',printerror=False)
        ## ~ [2c] Calculate fudge factor for each gene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.dict['Fudge'] = {}     # {Sequence object: position offset}
        ensback = {}                # Dictionary of {EnsLoci name:OMIM gene}
        mutations = {}              # Reorganised dictionary of {gene:{pos:Mutation}}
        for gene in rje.sortKeys(self.dict['Mutations']):
            try: seq = seqdict[genecards[gene]['EnsLoci']]
            except:
                self.log.printLog('#MAP','No EnsLoci protein mapped for %s' % gene)
                continue
            mutations[gene] = {}
            ensback[genecards[gene]['EnsLoci']] = gene
            mutpos = {}     # Dictionary of {pos:AA} to map onto sequence
            for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                (disease,mutation) = self.dict['Mutations'][gene][subid]
                # Mutations are stored as e.g. ALA123VAL: 3-letter wild AA, position, 3-letter mutant AA.
                (wild,pos,mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',mutation)
                mutpos[int(pos)] = rje_sequence.aa_3to1[wild.upper()]
                mutations[gene][int(pos)] = self.dict['Mutations'][gene][subid]
            self.dict['Fudge'][seq] = seq.fudgeFactor(mutpos)
        self.deBug(self.dict['Fudge'])
        ### ~ [3] Cross-reference to SLiMFinder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        allslims = {}   # Full dictionary of SLiMFinder results matching OMIM genes
        slimomim = []   # List of (gene,pos) overlapping with SLiMs
        outfile = 'rje_omim.slimfinder.tdt'
        dataheaders = string.split('Dataset,Rank,Pattern,Hit,Pos,EndPos,SeqLen,Variant,Match,AbsChg,NetChg,PepSeq,PepDesign',',')
        headers = ['Gene','OMIM','SubID','Mutation','Disease'] + dataheaders
        rje.delimitedFileOutput(self,outfile,headers,delimit='\t',rje_backup=True)
        for file in glob.glob(self.info['SlimDir'] + '*.occ.csv'):      # Potential SLiM
            slimdata = rje.dataDict(self,file,['Pattern','Hit','Pos','Match'],dataheaders,delimit=',')
            for occ in slimdata:
                if slimdata[occ]['Hit'] in ensback:     # OMIM gene - possible overlap
                    gene = ensback[slimdata[occ]['Hit']]
                    (start,end) = (int(slimdata[occ]['Pos']),int(slimdata[occ]['EndPos']))
                    if gene not in allslims: allslims[gene] = {}
                    allslims[gene][occ] = slimdata[occ]
                    for mpos in mutations[gene]:
                        # Fudge factor corrects any offset between OMIM and EnsLoci numbering.
                        if start <= (mpos + self.dict['Fudge'][seqdict[genecards[gene]['EnsLoci']]]) <= end:
                            self.log.printLog('#OMIMSLIM','%s %s %s (%d-%d) = %s' % (slimdata[occ]['Dataset'],slimdata[occ]['Hit'],slimdata[occ]['Pattern'],start,end,mutations[gene][mpos]))
                            slimdata[occ]['Gene'] = gene
                            slimdata[occ]['OMIM'] = string.join(self.dict['Records'][gene])
                            slimdata[occ]['Mutation'] = mutations[gene][mpos][1]
                            slimdata[occ]['Disease'] = mutations[gene][mpos][0]
                            rje.delimitedFileOutput(self,outfile,headers,'\t',slimdata[occ])
                            if (gene,mpos) not in slimomim: slimomim.append((gene,mpos))
        ### ~ [4] Calculate coverage of SLiMs for "significance" assessment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (inslim,resx,mutx) = (0,0,0)    # No. of residues in SLiMs, total residue count + no. mutations that may overlap
        for gene in mutations:      # These are just the genes that mapped to sequences
            mutx += len(mutations[gene])
            resx += seqdict[genecards[gene]['EnsLoci']].aaLen()
            if gene in allslims:    # Partially covered by SLiMs
                res = [0] * seqdict[genecards[gene]['EnsLoci']].aaLen()
                for occ in allslims[gene]:
                    (start,end) = (int(allslims[gene][occ]['Pos'])-1,int(allslims[gene][occ]['EndPos']))
                    # Mark 0-based residues start..end-1 as covered. BUGFIX: was res[end-1:],
                    # which duplicated residue end-1 and grew res by one element per occurrence,
                    # inflating the coverage estimate (and the list length).
                    res = res[:start] + [1] * (end-start) + res[end:]
                self.deBug('%s %d (%d)' % (gene,sum(res),seqdict[genecards[gene]['EnsLoci']].aaLen()))
                inslim += sum(res)
        self.log.printLog('#COV','SLiMs have %.1f%% coverage of OMIM gene sequences' % (100.0*inslim/resx))
        self.log.printLog('#MUT','%d mutations that could potentially occur in SLiMs' % mutx)
        self.log.printLog('#PROB','Probability of observed %d mutation overlap = %.4f' % (len(slimomim),rje.binomial(len(slimomim),mutx,float(inslim)/resx,callobj=self)))
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def mapRegionsToSequences(self):    ### Maps tabulates PPI regions onto sequence datasets
    '''
    Maps tabulated PPI regions onto sequence datasets.
    Reads ppi_region.tdt ({Interactor,Protein} -> Start/End lists), uppercases the
    interaction regions within each (lowercased) spoke sequence, and writes one fasta
    file per hub/spoke pair containing the region-marked query plus the rest of the
    hub interactome. Pairs whose spoke had to be added to the interactome go to
    RegPPIAdd/ instead of RegPPI/.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        minseq = 3      # Minimum interactome size for a dataset to be output
        outdir = 'RegPPI/'
        adddir = 'RegPPIAdd/'   # Output for pairs where the spoke was not already a known interactor
        rje.mkDir(self,outdir)
        rje.mkDir(self,adddir)
        tabfile = 'ppi_region.tdt'
        region = rje.dataDict(self,tabfile,['Interactor','Protein'],['Start','End'],lists=True)
        ### ~ [2] Work through each pair in turn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        px = 0.0; ptot = len(region); fx = 0
        for pair in rje.sortKeys(region):
            self.progLog('\r#FAS','Generating fasta files: %.2f%%' % (px/ptot)); px += 100.0
            ## ~ [2a] Map sequences to PPI dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # dataDict key is Interactor + tab + Protein; spoke starts as a protein ID.
            [hub, spoke] = string.split(pair,'\t')
            try: qryseq = self.dict['SeqObj'][spoke]
            except:
                self.printLog('\n#QRY','Spoke gene "%s" missing from Sequence file' % spoke)
                continue
            try: spoke = self.dict['Seq2Gene'][spoke]   # Convert spoke protein ID -> gene
            except:
                self.printLog('\n#QRY','Spoke protein "%s" missing from PPI dictionary' % spoke)
                continue
            if hub not in self.dict['PPI']:
                self.printLog('\n#HUB','Hub gene "%s" missing from PPI dictionary' % hub)
                continue
            addspoke = spoke not in self.dict['PPI'][hub]
            if addspoke:
                self.dict['PPI'][hub].append(spoke)
                self.printLog('\n#PPI','Added spoke gene "%s" to hub "%s" interactome' % (spoke,hub))
            if len(self.dict['PPI'][hub]) < minseq:
                self.printLog('\n#HUB','Hub "%s" interactome too small (<%s spokes)' % (hub,minseq))
                continue
            ## ~ [2b] Identify query sequence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Pool all region start/end positions and consume them pairwise after sorting.
            # NOTE(review): assumes regions do not overlap — overlapping regions would
            # pair the wrong start/end after the sort.
            reglist = []
            for pos in region[pair]['Start'] + region[pair]['End']: reglist.append(string.atoi(pos))
            reglist.sort()
            qsequence = qryseq.info['Sequence'].lower()     # Regions are marked by uppercasing
            self.deBug(len(qsequence))
            self.deBug(qsequence)
            prelen = len(qsequence)
            while reglist:
                self.deBug(reglist)
                try:
                    startx = reglist.pop(0) - 1     # 1-based start -> 0-based index
                    endx = reglist.pop(0)           # 1-based inclusive end == exclusive slice end
                except:
                    self.errorLog('%s PPI Region problem: %s' % (pair,region[pair]))
                    continue
                self.deBug(qsequence[startx-1:endx+1].upper())
                qsequence = qsequence[:startx] + qsequence[startx:endx].upper() + qsequence[endx:]
                self.deBug(qsequence)
            # Region marking must never change the sequence length; bail out loudly if it did.
            if len(qsequence) != prelen:
                self.printLog('#F**K','%s' % region[pair])
                self.printLog('#F**K',qryseq.info['Sequence'].lower())
                self.printLog('#F**K',qsequence)
                raise ValueError
            ## ~ [2c] Output sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if addspoke: outfile = '%s%s.%s.fas' % (adddir,hub,spoke); ox = 1
            else: outfile = '%s%s.%s.fas' % (outdir,hub,spoke); ox = 1
            open(outfile,'w').write('>%s\n%s\n' % (qryseq.info['Name'],qsequence))
            for spoke2 in self.dict['PPI'][hub]:
                if spoke2 == spoke: continue
                try:
                    sseq = self.dict['SeqObj'][self.dict['Gene2Seq'][spoke2]]
                    open(outfile,'a').write('>%s\n%s\n' % (sseq.info['Name'],sseq.info['Sequence']))
                    ox += 1
                except: pass    # Spoke gene with no mapped sequence: silently skipped
            self.printLog('\n#FAS','%s sequences output to %s' % (ox,outfile))
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def setup(self):    ### Loads data into attributes.
    '''
    Loads data into attributes: UniProt entries (or a plain SeqList fallback),
    domain assignments from UniProt features, the PPI dictionary (EnsLoci-keyed),
    iPfam domain-domain interactions and family (Qry/Hit) data.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ UniProt Object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniprot = self.obj['UniProt'] = rje_uniprot.UniProt(self.log,self.cmd_list)
        uniprot.readUniProt()
        if uniprot.entryNum() > 0:  ### UniProt data loaded. Populate seqlist and domain dictionary.
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F'])
            for entry in uniprot.list['Entry']:
                seq = entry.obj['Sequence']
                seqlist.seq.append(entry.obj['Sequence'])
                name = seq.shortName()
                self.dict['Entry'][name] = entry
                self.dict['Seq'][name] = seq
                # Domain dictionary: first word of each DomFT feature description -> member names.
                for ft in entry.list['Feature']:
                    if ft['Type'] in self.list['DomFT']:
                        try:
                            dom = string.split(ft['Desc'])[0]
                            if dom not in self.dict['Domain']: self.dict['Domain'][dom] = []
                            if name not in self.dict['Domain'][dom]: self.dict['Domain'][dom].append(name)
                        except: self.errorLog('Trouble with %s feature %s' % (name,ft))
        ## ~ [1b] ~ SeqList only ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:   # No UniProt data: autoload sequences; no Entry objects or domains available.
            seqlist = rje_seq.SeqList(self.log,self.cmd_list)
            for seq in seqlist.seq:
                name = seq.shortName()
                self.dict['Entry'][name] = None
                self.dict['Seq'][name] = seq
            #!# Consider adding loading domains from a table #!#
        ## ~ [1c] ~ Add PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # NOTE(review): bare expression — has no effect (or raises KeyError if the key is
        # missing); presumably meant to be self.dict['PPI'] = {} — confirm against _setAttributes.
        self.dict['PPI']    # Dictionary of ShortName-centred
        ppi = rje.dataDict(self,self.info['PPI'])
        for hub in ppi:
            if ppi[hub]['EnsLoci'] == '-': continue     # '-' marks hubs with no EnsLoci mapping
            ens = ppi[hub]['EnsLoci']
            if ens not in self.dict['PPI']: self.dict['PPI'][ens] = []
            self.dict['Gene'][ens] = hub
            for gene in string.split(ppi[hub]['PPI'],','):
                # NOTE(review): ppi[gene] will raise KeyError if an interactor gene has no
                # row of its own in the PPI table (caught by the outer except).
                if ppi[gene]['EnsLoci'] == '-': continue
                if ppi[gene]['EnsLoci'] not in self.dict['PPI'][ens]: self.dict['PPI'][ens].append(ppi[gene]['EnsLoci'])
        ## ~ [1d] ~ Add DDI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.dict['DDI'] = {}
        if self.info['DDI'].lower() not in ['','none']:
            data = rje.dataDict(self,self.info['DDI'],mainkeys=['Name1'],datakeys=['Name2'],
                                headers=['Pfam1','Pfam2','Name1','Name2','Acc1','Acc2','Code1','Code2'],lists=True)
            ## ~ Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
            (dx,dtot) = (0.0,len(data))
            self.deBug(data)
            try: rje.sortKeys(data)
            except: self.errorLog('F**k',quitchoice=True)
            for p1 in rje.sortKeys(data):
                self.progLog('\r#DDI','Parsing DDI from iPFam: %.1f%%' % (dx/dtot))
                # Store interactions symmetrically: p1 <-> p2 in both lists.
                if p1 not in self.dict['DDI']: self.dict['DDI'][p1] = []
                for p2 in data[p1]['Name2']:
                    if p2 not in self.dict['DDI']: self.dict['DDI'][p2] = []
                    if p2 not in self.dict['DDI'][p1]: self.dict['DDI'][p1].append(p2)
                    if p1 not in self.dict['DDI'][p2]: self.dict['DDI'][p2].append(p1)
            self.printLog('\r#DDI','Parsing DDI from iPFam: %s domains' % (rje.integerString(dtot)))
        ## ~ [1e] ~ Family data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.dict['Fam'] = {}
        if self.info['Fam'].lower() not in ['','none']:
            data = rje.dataDict(self,self.info['Fam'],mainkeys=['Qry'],datakeys=['Hit'],lists=True)
            for qry in self.dict['Seq']:
                self.dict['Fam'][qry] = []
                # Query may be keyed by short name or by accession number in the family table.
                if qry in data: self.dict['Fam'][qry] = data[qry]['Hit']
                elif self.dict['Seq'][qry].info['AccNum'] in data: self.dict['Fam'][qry] = data[self.dict['Seq'][qry].info['AccNum']]['Hit']
                if qry not in self.dict['Fam'][qry]: self.dict['Fam'][qry].append(qry)  # Family always includes self
    except: self.errorLog('Problem with SLiMPID.setup()',quitchoice=True)
def mapRegionsToSequences(self):    ### Maps tabulates PPI regions onto sequence datasets
    '''
    Maps tabulated PPI regions onto sequence datasets.
    Reads ppi_region.tdt ({Interactor,Protein} -> Start/End lists), uppercases the
    interaction regions within each (lowercased) spoke sequence, and writes one fasta
    file per hub/spoke pair containing the region-marked query plus the rest of the
    hub interactome. Pairs whose spoke had to be added to the interactome go to
    RegPPIAdd/ instead of RegPPI/.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        minseq = 3      # Minimum interactome size for a dataset to be output
        outdir = 'RegPPI/'
        adddir = 'RegPPIAdd/'   # Output for pairs where the spoke was not already a known interactor
        rje.mkDir(self,outdir)
        rje.mkDir(self,adddir)
        tabfile = 'ppi_region.tdt'
        region = rje.dataDict(self,tabfile,['Interactor','Protein'],['Start','End'],lists=True)
        ### ~ [2] Work through each pair in turn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        px = 0.0; ptot = len(region); fx = 0
        for pair in rje.sortKeys(region):
            self.progLog('\r#FAS','Generating fasta files: %.2f%%' % (px/ptot)); px += 100.0
            ## ~ [2a] Map sequences to PPI dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # dataDict key is Interactor + tab + Protein; spoke starts as a protein ID.
            [hub, spoke] = string.split(pair,'\t')
            try: qryseq = self.dict['SeqObj'][spoke]
            except:
                self.printLog('\n#QRY','Spoke gene "%s" missing from Sequence file' % spoke)
                continue
            try: spoke = self.dict['Seq2Gene'][spoke]   # Convert spoke protein ID -> gene
            except:
                self.printLog('\n#QRY','Spoke protein "%s" missing from PPI dictionary' % spoke)
                continue
            if hub not in self.dict['PPI']:
                self.printLog('\n#HUB','Hub gene "%s" missing from PPI dictionary' % hub)
                continue
            addspoke = spoke not in self.dict['PPI'][hub]
            if addspoke:
                self.dict['PPI'][hub].append(spoke)
                self.printLog('\n#PPI','Added spoke gene "%s" to hub "%s" interactome' % (spoke,hub))
            if len(self.dict['PPI'][hub]) < minseq:
                self.printLog('\n#HUB','Hub "%s" interactome too small (<%s spokes)' % (hub,minseq))
                continue
            ## ~ [2b] Identify query sequence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Pool all region start/end positions and consume them pairwise after sorting.
            # NOTE(review): assumes regions do not overlap — overlapping regions would
            # pair the wrong start/end after the sort.
            reglist = []
            for pos in region[pair]['Start'] + region[pair]['End']: reglist.append(string.atoi(pos))
            reglist.sort()
            qsequence = qryseq.info['Sequence'].lower()     # Regions are marked by uppercasing
            self.deBug(len(qsequence))
            self.deBug(qsequence)
            prelen = len(qsequence)
            while reglist:
                self.deBug(reglist)
                try: startx = reglist.pop(0) - 1; endx = reglist.pop(0)     # 1-based -> 0-based slice bounds
                except:
                    self.errorLog('%s PPI Region problem: %s' % (pair,region[pair]))
                    continue
                self.deBug(qsequence[startx-1:endx+1].upper())
                qsequence = qsequence[:startx] + qsequence[startx:endx].upper() + qsequence[endx:]
                self.deBug(qsequence)
            # Region marking must never change the sequence length; bail out loudly if it did.
            if len(qsequence) != prelen:
                self.printLog('#F**K','%s' % region[pair])
                self.printLog('#F**K',qryseq.info['Sequence'].lower())
                self.printLog('#F**K',qsequence)
                raise ValueError
            ## ~ [2c] Output sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if addspoke: outfile = '%s%s.%s.fas' % (adddir,hub,spoke); ox = 1
            else: outfile = '%s%s.%s.fas' % (outdir,hub,spoke); ox = 1
            open(outfile,'w').write('>%s\n%s\n' % (qryseq.info['Name'],qsequence))
            for spoke2 in self.dict['PPI'][hub]:
                if spoke2 == spoke: continue
                try:
                    sseq = self.dict['SeqObj'][self.dict['Gene2Seq'][spoke2]]
                    open(outfile,'a').write('>%s\n%s\n' % (sseq.info['Name'],sseq.info['Sequence']))
                    ox += 1
                except: pass    # Spoke gene with no mapped sequence: silently skipped
            self.printLog('\n#FAS','%s sequences output to %s' % (ox,outfile))
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def picsi(self):    ### Cleans up cross-species search results
    '''
    Cleans up cross-species search results.
    Reads the Mascot summary file, collapses sub-peptides into their longest
    containing peptide (I/L and Q/K treated as equivalent), removes non-query-species
    redundancy, classifies each hit (REJECT/UNIQUE/NR/REDUNDANT) and writes a
    *.clean.tdt summary table.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        datafile = self.info['SumFile']
        delimit = rje.delimitFromExt(filename=self.info['SumFile'])
        data = {}           # search:{hit:{???}}
        pep2prot = {}       # search:{peptide:[hits]}
        id2prot = {}        # search:{id:hit}
        prot2desc = {}
        fullpeplist = {}
        pepcon = {}         # Convert pep:longer pep
        speclist = []       # List of species codes
        ### ~ [1] Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        indata = rje.dataDict(self,datafile,['search','prot_hit_num'],'All',lists=True)
        for ikey in rje.sortKeys(indata):
            (search,id) = string.split(ikey,delimit)
            prot = indata[ikey]['prot_acc'][0]
            desc = string.replace(indata[ikey]['prot_desc'][0],'Full=','')
            if desc[3:7] == 'Name': desc = desc[9:]     # Strip "RecName:"-style prefix
            prot2desc[prot] = desc; self.printLog('#DESC','%s = %s' % (prot,desc))
            indata[ikey]['pep_seq'] = string.join(indata[ikey]['pep_seq'],'|')
            # I/L and Q/K are indistinguishable by mass: convert before comparing peptides.
            pepconv = string.replace(indata[ikey]['pep_seq'],'I','L')
            pepconv = string.replace(pepconv,'Q','K')
            peplist = rje.sortUnique(string.split(pepconv,'|'))
            indata[ikey]['pep_seq'] = string.join(rje.sortUnique(string.split(indata[ikey]['pep_seq'],'|')),'|')
            if search not in data:
                data[search] = {}
                pep2prot[search] = {}
                id2prot[search] = {}
                fullpeplist[search] = []
                pepcon[search] = {}
            fullpeplist[search] += peplist
            id2prot[search][id] = prot
            spec = string.split(prot,'_')[1]    # Species code from PROT_SPEC naming
            if spec not in speclist: speclist.append(spec)
            data[search][prot] = {'search':search,'pepcount':len(peplist),'hit':id,'desc':desc,'spec':spec,
                                  'pep_uniq':0,'peplist':indata[ikey]['pep_seq'],'conpep':peplist[0:],
                                  'pep_rem':0}
            try: data[search][prot]['accnum'] = self.dict['Acc2Seq'][prot].info['AccNum']
            except: data[search][prot]['accnum'] = string.split(prot,'__')[-1]
            for pep in peplist:
                if pep not in pep2prot[search]: pep2prot[search][pep] = []
                pep2prot[search][pep].append(prot)
        ## ~ [1a] Convert peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        for search in fullpeplist:
            fullpeplist[search] = rje.sortUnique(fullpeplist[search])
            # Map each peptide that is a substring of a longer peptide onto that longer peptide.
            for pep in fullpeplist[search][0:]:
                for pep2 in fullpeplist[search]:
                    if pep != pep2 and pep in pep2:
                        pepcon[search][pep] = pep2
                        fullpeplist[search].remove(pep)
                        break
            # Follow conversion chains to their terminal (longest) peptide.
            # BUGFIX: was pepcon[search][pepcon[pep]] — indexed the outer dict with a
            # peptide key that lives in pepcon[search], raising KeyError on any chain.
            for pep in pepcon[search]:
                while pepcon[search][pep] in pepcon[search]: pepcon[search][pep] = pepcon[search][pepcon[search][pep]]
            self.printLog('#PEP','%s %s peptide conversions' % (len(pepcon[search]),search))
            #self.deBug(pepcon[search])
            #self.deBug(rje.sortKeys(pep2prot[search]))
            pp = 0; pm = 0
            for prot in data[search]:
                for pep in data[search][prot]['conpep'][0:]:
                    if pep in pepcon[search]:
                        newpep = pepcon[search][pep]
                        if newpep not in data[search][prot]['conpep']: data[search][prot]['conpep'].append(newpep); pp += 1
                        # BUGFIX: was pm += 0, so the "removed" count in the log was always 0.
                        data[search][prot]['conpep'].remove(pep); pm += 1
                        if prot not in pep2prot[search][newpep]: pep2prot[search][newpep].append(prot)
                        if pep in pep2prot[search]: pep2prot[search].pop(pep)
                data[search][prot]['pep_con'] = len(data[search][prot]['conpep'])
            self.printLog('#PEP','%s %s converted peptides added; %s removed' % (pp,search,pm))
        ### ~ [2] Calculate Unique/Redundancy status ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for search in pep2prot:
            ## ~ [2a] Species Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Any peptide also hit by a query-species protein is stripped from non-query hits.
            remx = 0
            for prot in data[search]:
                if data[search][prot]['spec'] != self.info['QrySpec']: continue
                for pep in data[search][prot]['conpep']:
                    for prot2 in pep2prot[search][pep][0:]:
                        if data[search][prot2]['spec'] == self.info['QrySpec']: continue
                        pep2prot[search][pep].remove(prot2)
                        data[search][prot2]['conpep'].remove(pep)
                        data[search][prot2]['pep_rem'] += 1; remx += 1
            self.printLog('#REM','%s %s peptides removed from non-%s hits' % (rje.integerString(remx),search,self.info['QrySpec']))
            ## ~ [2b] One-hit wonders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Hits with <2 peptides do not count towards peptide uniqueness.
            for prot in data[search]:
                if len(data[search][prot]['conpep']) < 2:
                    for pep in data[search][prot]['conpep']:
                        #if pep in pep2prot[search] and prot in pep2prot[search][pep]:
                        pep2prot[search][pep].remove(prot)
            ## ~ [2c] Unique peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            ux = 0
            for pep in pep2prot[search]:
                #self.deBug(pep)
                if len(pep2prot[search][pep]) == 1: data[search][pep2prot[search][pep][0]]['pep_uniq'] += 1; ux += 1
            self.printLog('#UNIQ','%s unique %s peptides' % (rje.integerString(ux),search))
            ## ~ [2d] Total Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            summary = {'HITS':len(data[search]),'REJECT':0,'UNIQUE':0,'NR':0,'REDUNDANT':0}
            rx = 0
            for prot in data[search]:
                #if data[search][prot]['unique']: data[search][prot]['red'] = False; continue
                data[search][prot]['pep_red'] = 0   # Redundant peptides found in proteins with unique peptides
                data[search][prot]['pep_nr'] = 0    # Redundant peptides found only in proteins without unique peptides
                for pep in data[search][prot]['conpep']:
                    if pep2prot[search][pep] == [prot]: continue
                    upep = False
                    for prot2 in pep2prot[search][pep]:
                        if data[search][prot2]['pep_uniq']: upep = True; break
                    if upep: data[search][prot]['pep_red'] += 1     # Redundant peptide found in unique protein
                    else: data[search][prot]['pep_nr'] += 1         # Redundant peptide NOT found in unique protein
                if len(data[search][prot]['conpep']) < 2: data[search][prot]['class'] = 'REJECT'; rx += 1
                elif data[search][prot]['pep_uniq']: data[search][prot]['class'] = 'UNIQUE'
                elif data[search][prot]['pep_nr']: data[search][prot]['class'] = 'NR'
                else: data[search][prot]['class'] = 'REDUNDANT'; rx += 1
                summary[data[search][prot]['class']] += 1
            self.printLog('#REJ','%s rejected %s hits' % (rje.integerString(rx),search))
            for x in rje.sortKeys(summary): self.printLog('#%s' % search,'%s %s' % (summary[x],x))
        ### ~ [3] Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        speclist.sort()
        species = {}
        for spec in speclist:
            try:
                grep = os.popen('grep %s %s' % (spec,self.info['SpecTDT'])).read()
                species[spec] = string.split(grep,':')[-4]
                self.printLog('#SPEC','%s = %s' % (spec,species[spec]))
            except: species[spec] = '?'     # Species code not found in SpecTDT
        ### ~ [END] Output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        outfile = '%s.clean.tdt' % rje.baseFile(self.info['SumFile'])
        headers = ['search','hit','class','accnum','spec','species','desc','pepcount','pep_con','pep_rem','pep_uniq','pep_nr','pep_red','peplist','conpep']
        if self.dict['Acc2Seq']: headers.insert(3,'cluster')
        rje.delimitedFileOutput(self,outfile,headers,datadict={},rje_backup=True)
        for search in rje.sortKeys(data):
            if self.dict['Acc2Seq']: self.clusterGoodSeq(search,data[search])
            for prot in rje.sortKeys(data[search]):
                # Prefer the species name embedded in a gi-style description; else the SpecTDT lookup.
                if rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc']):
                    data[search][prot]['species'] = rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc'])[1]
                else: data[search][prot]['species'] = species[data[search][prot]['spec']]
                rje.delimitedFileOutput(self,outfile,headers,datadict=data[search][prot])
    except: self.errorLog('Errg')
def run(self):  ### Main run method
    '''
    Main run method.
    Loads GO mappings and SLiMSearch occurrence data, maps each occurrence's sequence
    to its gene and that gene's GO terms (plus all parent terms), and outputs a
    *.slimfungo.tdt table of occurrences grouped by GO term, motif and type
    (fwd/rev/scram), keeping only GO/motif groups with >= minocc occurrences and a
    'fwd' entry.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        mygo = rje_go.GO(self.log,self.cmd_list)
        mygo.readGO()
        gomap = rje.dataDict(self,self.info['GOMap'],mainkeys=['Ensembl Gene ID'],datakeys=['GO ID'],lists=True)
        self.deBug(rje.sortKeys(gomap)[:100])
        #!# Replace 'Ensembl Gene ID' with commandline parameter at some point #!#
        self.printLog('#GOMAP','Loaded GO mappings for %s sequence IDs' % (rje.integerString(len(gomap))))
        slimocc = rje.dataDict(self,self.info['OccData'],mainkeys=['Motif','Seq','Start_Pos','End_Pos'],datakeys=['Motif','Seq','Start_Pos','End_Pos','Cons','HomNum'])
        self.printLog('#OCC','Loaded Data for %s motif occurrences.' % (rje.integerString(len(slimocc))))
        ## ~ [1a] ~ Sequence mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        seqlist = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list)
        seqmap = {}     # {short sequence name : gene ID pulled from "gene:X]" in the name}
        (sx,stot) = (0.0,seqlist.seqNum())
        for seq in seqlist.seq:
            self.progLog('#SEQMAP','Mappings sequence IDs: %.1f%%' % (sx/stot)); sx += 100.0
            if rje.matchExp('gene:(\S+)\]',seq.info['Name']): seqmap[seq.shortName()] = rje.matchExp('gene:(\S+)\]',seq.info['Name'])[0]
        self.printLog('\r#SEQMAP','Mappings %s sequence IDs complete: %s mapped' % (rje.integerString(stot),rje.integerString(len(seqmap))))
        self.deBug(rje.sortKeys(seqmap)[:100])
        ### ~ [2] ~ Output new data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        goocc = {}      # {GO ID: {motif: {'fwd'/'rev'/'scram': [occurrence dicts]}}}
        outfile = string.join(string.split(self.info['OccData'],'.')[:-1] + ['slimfungo','tdt'],'.')
        headers = ['GO','Motif','Type','Seq','Start_Pos','End_Pos','Cons','HomNum']
        for okey in slimocc.keys():
            self.progLog('#NEW','Making new GO occurrences: %s ' % (rje.integerString(len(slimocc))))
            data = slimocc.pop(okey)
            gene = seq = data['Seq']
            type = 'fwd'
            if string.split(data['Motif'],'_')[-1] in ['rev','scram']: type = string.split(data['Motif'],'_')[-1]
            data['Motif'] = string.join(string.split(data['Motif'],'_')[:-1],'_')
            # BUGFIX: 'motif' was referenced below without ever being assigned (NameError);
            # bind it to the suffix-stripped motif name.
            motif = data['Motif']
            if gene not in gomap and gene in seqmap: gene = seqmap[gene]
            golist = []
            if gene in gomap:
                # Annotate with each mapped GO ID plus all its parent terms.
                for id in gomap[gene]: golist += mygo.parents(id)
            else: golist = ['NoGo']     # No GO annotation available for this gene
            self.deBug('%s:%s::%s' % (seq,gene,golist))
            for id in rje.sortUnique(golist,False,False):
                if id not in goocc: goocc[id] = {}
                if motif not in goocc[id]: goocc[id][motif] = {'fwd':[],'rev':[],'scram':[]}
                goocc[id][motif][type].append(rje.combineDict({'GO':id,'Type':type},data))
        # BUGFIX: the completion message applied % to a string with no conversion
        # specifier, raising TypeError (caught silently by the outer except).
        self.printLog('\r#NEW','Making new GO occurrences complete.')
        rje.delimitedFileOutput(self,outfile,headers,rje_backup=True)
        (mx,ox,ix,itot) = (0,0,0.0,len(goocc))
        for id in rje.sortKeys(goocc):
            for motif in rje.sortKeys(goocc[id]):
                # Drop types with too few occurrences.
                # BUGFIX: was len(goocc[id][motif][type] < self.stat['MinOcc']) - the
                # comparison was inside len(), so the threshold was never applied correctly.
                for type in rje.sortKeys(goocc[id][motif]):
                    if len(goocc[id][motif][type]) < self.stat['MinOcc']: goocc[id][motif].pop(type)
                # Keep only groups with a 'fwd' entry and at least one control type.
                if len(goocc[id][motif]) < 2 or 'fwd' not in goocc[id][motif]: continue
                mx += 1
                for type in goocc[id][motif]:
                    for occ in goocc[id][motif][type]: rje.delimitedFileOutput(self,outfile,headers,datadict=occ); ox += 1
            ix += 100.0     # BUGFIX: progress counter was never incremented (stuck at 0%)
            self.progLog('#OUT','Output to %s: %.2f%% :: %s motifs; %s occ.' % (outfile,ix/itot,rje.integerString(mx),rje.integerString(ox)))
        self.printLog('\r#OUT','Output of occurrences to %s is now complete: %s motifs; %s occ.' % (outfile,rje.integerString(mx),rje.integerString(ox)))
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def readPELM(self):     ### Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
    '''
    Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
    Builds self.dict['PhosphoSites'] = {acc:{pos:{'aa':code}}}, cross-checks sites
    against the reported sequences, wraps each accession's sequence in a SeqList
    entry (named from UniProt where possible) and saves a fasta file for BLASTing.
    Sets self.obj['SeqList'] and self.obj['UniProt'].
    '''
    try:### ~ [1] Setup & Read File into Data Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        data = rje.dataDict(self,self.info['PELM'],mainkeys=['acc','position'])
        seqdict = {}    # Dictionary of Acc:Sequence
        ### ~ [2] Generate PhosphoSites dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pdict = self.dict['PhosphoSites']
        for dkey in data:
            ## ~ [2a] Basic acc, seq and pos ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # dataDict keys join the mainkey fields; split back into accession + position.
            (acc,pos) = string.split(dkey)
            pos = string.atoi(pos)
            if acc not in pdict: pdict[acc] = {}
            if pos not in pdict[acc]: pdict[acc][pos] = {}
            ## ~ [2b] PhosphoELM data with checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Consistency checks: same accession must always report the same sequence,
            # the same site the same residue, and the residue must match the sequence.
            if acc not in seqdict: seqdict[acc] = data[dkey]['sequence']
            elif seqdict[acc] != data[dkey]['sequence']: self.log.printLog('#ERR','Warning. Sequence mismatch for %s' % acc)
            if 'aa' not in pdict[acc][pos]: pdict[acc][pos]['aa'] = data[dkey]['code']
            elif pdict[acc][pos]['aa'] != data[dkey]['code']:
                self.log.printLog('#ERR','Warning. PhosphoSite mismatch for %s at pos %d: %s not %s' % (acc,pos,data[dkey]['code'],pdict[acc][pos]['aa']))
            if data[dkey]['code'] != seqdict[acc][(pos - 1):pos]:   # pos is 1-based
                self.log.printLog('#ERR','Warning. PhosphoSeq mismatch for %s at pos %d: %s not %s' % (acc,pos,data[dkey]['code'],seqdict[acc][pos - 1:pos]))
        ### ~ [3] Make sequence objects and update PhosphoSites keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [3a] Setup objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        acclist = rje.sortKeys(seqdict)
        pelmuni = rje_uniprot.UniProt(self.log,self.cmd_list)   # UniProt entry
        unidict = pelmuni.accDict(acclist)      # Dictionary of {acc:UniProtEntry}
        pelmseq = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None'])    # SeqList object
        ## ~ [3b] Add one sequence for each AccNum and update seqdict ~~~~~~~~~~~~~~~~~~~~~~~~ ##
        #!# Look out for splice variants! (There are some!) - Copy UniProt and change sequence & AccNum #!#
        for acc in acclist:     #!# Make accdict of {acc:Seq} using unidict and seqlist #!#
            sequence = seqdict[acc]
            try:
                # Base accession (splice suffix stripped) looks up the UniProt entry for naming.
                uni = unidict[string.split(acc,'-')[0]]
                desc = uni.obj['Sequence'].info['Description']
                name = '%s__%s %s' % (uni.obj['Sequence'].info['ID'],acc,desc)
                if sequence != uni.obj['Sequence'].info['Sequence']: self.log.printLog('#WARNING','Sequence mismatch for UniProt entry %s' % acc)
            except:
                self.log.errorLog('Problem with %s' % acc)
                name = '%s_UNK__%s' % (acc,acc)     #!# Add sequences where UniProt missing #!#
            seqdict[acc] = pelmseq._addSeq(name,sequence)   # seqdict now maps acc -> Sequence object
        ## ~ [3c] Filtering of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.opt['FilterSeq']:
            pelmseq.autoFilter()
            for acc in acclist:
                if seqdict[acc] not in pelmseq.seq: seqdict.pop(acc)    # Drop filtered-out sequences
            acclist = rje.sortKeys(seqdict)
        ## ~ [3d] Save sequences for BLASTing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Save unless the file exists and the user (interactively) declines to overwrite.
        if not os.path.exists(self.info['PELMFas']) or self.stat['Interactive'] < 0 or rje.yesNo('%s exists: overwrite?' % self.info['PELMFas']):
            pelmseq.saveFasta(seqfile=self.info['PELMFas'])
        self.obj['SeqList'] = pelmseq
        self.obj['UniProt'] = pelmuni
    except: self.log.errorLog('Problem during PhosphoSeq.readPELM')
def setup(self):    ### Loads data into attributes.
    '''
    Loads data into attributes.

    Reads UniProt entries (or falls back to a plain sequence list) into
    self.dict['Entry'] and self.dict['Seq'], builds a domain->sequence mapping from
    UniProt features, then loads pairwise PPI, iPFam domain-domain interactions (DDI)
    and family (Fam) data into the corresponding self.dict entries. No return value;
    errors are logged via self.errorLog with quitchoice.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ UniProt Object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniprot = self.obj['UniProt'] = rje_uniprot.UniProt(self.log, self.cmd_list)
        uniprot.readUniProt()
        if uniprot.entryNum() > 0:  ### UniProt data loaded. Populate seqlist and domain dictionary.
            seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=F'])
            for entry in uniprot.list['Entry']:
                seq = entry.obj['Sequence']
                seqlist.seq.append(entry.obj['Sequence'])
                name = seq.shortName()
                self.dict['Entry'][name] = entry
                self.dict['Seq'][name] = seq
                for ft in entry.list['Feature']:
                    if ft['Type'] in self.list['DomFT']:
                        try:
                            dom = string.split(ft['Desc'])[0]   # First word of feature description = domain name
                            if dom not in self.dict['Domain']: self.dict['Domain'][dom] = []
                            if name not in self.dict['Domain'][dom]: self.dict['Domain'][dom].append(name)
                        except: self.errorLog('Trouble with %s feature %s' % (name, ft))
        ## ~ [1b] ~ SeqList only ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:
            seqlist = rje_seq.SeqList(self.log, self.cmd_list)
            for seq in seqlist.seq:
                name = seq.shortName()
                self.dict['Entry'][name] = None
                self.dict['Seq'][name] = seq
            #!# Consider adding loading domains from a table #!#
        ## ~ [1c] ~ Add PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Fix: original line was the no-op bare expression `self.dict['PPI']` - initialise
        # explicitly, matching the [1d]/[1e] pattern for self.dict['DDI'] and self.dict['Fam'].
        self.dict['PPI'] = {}   # Dictionary of ShortName-centred PPI
        ppi = rje.dataDict(self, self.info['PPI'])
        for hub in ppi:
            if ppi[hub]['EnsLoci'] == '-': continue     # Hub has no EnsLoci sequence mapping
            ens = ppi[hub]['EnsLoci']
            if ens not in self.dict['PPI']: self.dict['PPI'][ens] = []
            self.dict['Gene'][ens] = hub
            for gene in string.split(ppi[hub]['PPI'], ','):
                # NOTE(review): assumes every listed interactor is itself a key of ppi - KeyError otherwise (caught by outer except).
                if ppi[gene]['EnsLoci'] == '-': continue
                if ppi[gene]['EnsLoci'] not in self.dict['PPI'][ens]: self.dict['PPI'][ens].append(ppi[gene]['EnsLoci'])
        ## ~ [1d] ~ Add DDI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.dict['DDI'] = {}
        if self.info['DDI'].lower() not in ['', 'none']:
            data = rje.dataDict(self, self.info['DDI'], mainkeys=['Name1'], datakeys=['Name2'],
                                headers=['Pfam1', 'Pfam2', 'Name1', 'Name2', 'Acc1', 'Acc2', 'Code1', 'Code2'], lists=True)
            ## ~ Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
            (dx, dtot) = (0.0, len(data))
            self.deBug(data)
            try: rje.sortKeys(data)
            except: self.errorLog('F**k', quitchoice=True)
            for p1 in rje.sortKeys(data):
                self.progLog('\r#DDI', 'Parsing DDI from iPFam: %.1f%%' % (dx / dtot))
                dx += 100.0     # Fix: counter was never advanced, so progress reported 0.0% throughout
                if p1 not in self.dict['DDI']: self.dict['DDI'][p1] = []
                for p2 in data[p1]['Name2']:
                    # Store each interaction symmetrically under both domains.
                    if p2 not in self.dict['DDI']: self.dict['DDI'][p2] = []
                    if p2 not in self.dict['DDI'][p1]: self.dict['DDI'][p1].append(p2)
                    if p1 not in self.dict['DDI'][p2]: self.dict['DDI'][p2].append(p1)
            self.printLog('\r#DDI', 'Parsing DDI from iPFam: %s domains' % (rje.integerString(dtot)))
        ## ~ [1e] ~ Family data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.dict['Fam'] = {}
        if self.info['Fam'].lower() not in ['', 'none']:
            data = rje.dataDict(self, self.info['Fam'], mainkeys=['Qry'], datakeys=['Hit'], lists=True)
            for qry in self.dict['Seq']:
                self.dict['Fam'][qry] = []
                if qry in data: self.dict['Fam'][qry] = data[qry]['Hit']
                elif self.dict['Seq'][qry].info['AccNum'] in data: self.dict['Fam'][qry] = data[self.dict['Seq'][qry].info['AccNum']]['Hit']
                if qry not in self.dict['Fam'][qry]: self.dict['Fam'][qry].append(qry)  # Every query is in its own family
    except: self.errorLog('Problem with SLiMPID.setup()', quitchoice=True)
def setup(self, gtext=''):  ### Main class setup method. gtext will over-ride input file.
    '''
    Main class setup method. gtext will over-ride input file.

    Parses glossary terms and definitions either from the raw text given in gtext or
    from self.str['InFile'], honouring the TermSplit delimiter and KeepOrder options,
    filters to self.list['Terms'] if set, and builds the nested word-by-word lookup
    in self.dict['Glossary'] (definition stored under the '=' key at each leaf).

    >> gtext:str [''] = raw glossary text; when empty, the input file is read instead.
    << returns True if setup successful, False on error.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['HTML'] = rje_html.HTML(self.log, self.cmd_list)
        ## ~ [1a] File names etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.basefile().lower() in ['', 'none']: self.basefile(rje.baseFile(self.getStr('InFile')))
        if self.getStr('OutFile').lower() in ['', 'none']: self.str['OutFile'] = '%s.html' % self.basefile()
        ## ~ [1b] Read in Glossary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        interms = []    # Terms in input order (used when KeepOrder=True)
        if gtext:
            # Map the symbolic TermSplit setting onto an actual delimiter character.
            delimit = self.getStr('TermSplit')
            if delimit.lower() == 'tab': delimit = '\t'
            if delimit.lower() == 'space': delimit = ' '
            if delimit.lower() == 'comma': delimit = ','
            if delimit.lower() == 'period (.)': delimit = '.'
            if delimit.lower() == 'colon': delimit = ':'
            glossary = {}
            for line in string.split(gtext, '\n'):
                splitline = string.split(line, delimit)
                # A trailing '.' on a period-delimited line leaves an empty final field - drop it.
                if delimit == '.' and (splitline[-1] in ['', ' ']): splitline = splitline[:-1]
                if not splitline: continue
                (term, definition) = (splitline[0], string.join(splitline[1:], delimit))
                if term == 'Term' and not glossary: continue    # Skip a header row
                if term:
                    glossary[term] = {'Definition': definition}
                    interms.append(term)
        else:
            try:
                # A leading 'Term' header means the file can be parsed as a delimited table.
                if not self.getBool('KeepOrder') and open(self.getStr('InFile'), 'r').readline()[:4] == 'Term':
                    glossary = rje.dataDict(self, self.getStr('InFile'), mainkeys=['Term'], datakeys=['Term', 'Definition'])
                else: return self.setup(open(self.getStr('InFile'), 'r').read())
            except:
                self.errorLog('Problem reading input as dataDict(). Will try as text.')
                return self.setup(open(self.getStr('InFile'), 'r').read())
        if self.list['Terms']:
            # Fix: iterate over a snapshot of the keys - popping from a dict while
            # iterating the dict itself raises RuntimeError.
            for term in list(glossary.keys()):
                if term not in self.list['Terms']: glossary.pop(term)
        elif self.getBool('KeepOrder'): self.list['Terms'] = interms
        else: self.list['Terms'] = rje.sortKeys(glossary)
        for term in glossary: glossary[term] = glossary[term]['Definition']     # Flatten to {term:definition}
        ### ~ [2] Create Full Glossary Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        nested = {}
        for term in glossary:
            tdict = nested
            for word in string.split(term.lower()):
                if word not in tdict: tdict[word] = {}
                tdict = tdict[word]
            tdict['='] = glossary[term]     # '=' leaf marks the end of a term and holds its definition
        self.dict['Glossary'] = nested
        return True     # Setup successful
    except:
        self.errorLog('Problem during %s setup.' % self)
        return False    # Setup failed
def setup(self):    ### Main class setup method.
    '''
    Main class setup method. Loads the full pairwise PPI table, filters out
    interactions that look like indirect complex co-membership, and loads the
    EnsEMBL loci sequences. Returns True on success, False on failure.
    '''
    try:### ~ [1] Pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): hard-coded data path - specific to the original run environment.
        ppipairwise = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/pingu.pairwise.tdt'
        self.progLog('\r#PPI', 'Loading pairwise data...')
        pairwise = rje.dataDict(self, ppipairwise, ['Hub', 'Spoke'], ['Spoke', 'SpokeSeq', 'Evidence'])
        gene2seq = {}   # {spoke gene: SpokeSeq name}
        seq2gene = {}   # {short sequence name: spoke gene}
        fullppi = {}    # {hub: {spoke: evidence string}}
        px = 0.0
        ptot = len(pairwise)
        ppix = 0        # Count of directed hub-spoke pairs stored
        for pair in rje.sortKeys(pairwise):     # sortKeys returns a list, so popping from pairwise below is safe
            self.progLog('\r#PPI', 'Processing full pairwise PPI: %.2f%%' % (px / ptot))
            px += 100.0
            [hub, spoke] = string.split(pair, '\t')     # dataDict keys are 'Hub\tSpoke'
            if spoke not in gene2seq:
                sseq = pairwise[pair]['SpokeSeq']
                gene2seq[spoke] = sseq
                seq2gene[string.split(sseq, '__')[0]] = spoke   # Leading short-name portion maps back to the gene
            if hub not in fullppi: fullppi[hub] = {}
            if spoke not in fullppi[hub]:
                fullppi[hub][spoke] = pairwise.pop(pair)['Evidence']    # Pop to release memory as we go
                ppix += 1
        # ppix/2: each interaction appears in both hub->spoke and spoke->hub directions.
        self.printLog('\r#PPI', 'Processed full pairwise PPI: %s genes; %s ppi.' % (rje.integerString(len(fullppi)), rje.integerString(ppix / 2)))
        ### ~ [2] Filter complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): hard-coded data path - specific to the original run environment.
        goodppifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/hybrid.txt'
        goodppi = self.loadFromFile(goodppifile, chomplines=True)   # Evidence types accepted as direct interactions
        self.dict['PPI'] = {}
        px = 0.0
        ptot = len(fullppi)
        fppix = ppix    # Remember pre-filter pair count for the summary log
        ppix = 0
        for hub in fullppi:
            self.progLog('\r#PPI', 'Filtering complexes: %.2f%% (%s hubs; %s ppi)' % (px / ptot, rje.integerString(len(self.dict['PPI'])), rje.integerString(ppix)))
            px += 100.0
            self.dict['PPI'][hub] = []
            for spoke in fullppi[hub]:
                # Keep the pair outright if any of its evidence types is in the "good" list.
                goodspoke = False
                for ptype in goodppi:
                    if rje.matchExp(':(%s)($|\|)' % ptype, fullppi[hub][spoke]):
                        goodspoke = True
                        break
                if goodspoke:
                    self.dict['PPI'][hub].append(spoke)
                    continue
                # Otherwise keep it only if hub and spoke share no third partner:
                # a shared partner suggests co-membership of a complex rather than direct binding.
                goodspoke = True
                for spoke2 in fullppi[hub]:
                    if spoke2 in [hub, spoke]: continue
                    # NOTE(review): assumes spoke also appears as a hub key in fullppi -
                    # KeyError otherwise (caught by the outer except).
                    if spoke2 in fullppi[spoke]:
                        goodspoke = False
                        break
                if goodspoke: self.dict['PPI'][hub].append(spoke)
            ppix += len(self.dict['PPI'][hub])
            if not self.dict['PPI'][hub]: self.dict['PPI'].pop(hub)     # Drop hubs with no surviving spokes
        self.printLog('\r#PPI', 'Filtered complexes: (%s -> %s hubs; %s -> %s ppi)' % (rje.integerString(len(fullppi)), rje.integerString(len(self.dict['PPI'])), rje.integerString(fppix / 2), rje.integerString(ppix / 2)))
        ### ~ [3] SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): hard-coded data path - specific to the original run environment.
        seqfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/EnsEMBL/ens_HUMAN.loci.fas'
        scmd = ['accnr=F', 'seqnr=F', 'seqin=%s' % seqfile] + self.cmd_list + ['autoload=T']
        seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log, scmd)
        self.dict['SeqObj'] = seqlist.seqNameDic('Max')
        self.dict['Gene2Seq'] = gene2seq
        self.dict['Seq2Gene'] = seq2gene
        return True     # Setup successful
    except:
        self.errorLog('Problem during %s setup.' % self)
        return False    # Setup failed
def run(self):  ### Main run method
    '''
    Main run method. Reformats *.fasta files into renamed *.fas files and BLAST-formats
    them, reads Mascot CSV exports to collect 6RF hit accessions, extracts the hit
    sequences, runs GABLAM against MC58_1 (and embl_bacteria for zero-hit queries), and
    writes the summary table MC58_6RF_Zeros.tdt. All file names are fixed conventions
    of this pipeline.
    '''
    try:### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for fasta in glob.glob('*.fasta'):
            fas = fasta[:-2]    # 'x.fasta' -> 'x.fas'
            if os.path.exists(fas): continue    # Already reformatted
            sx = 0
            for line in open(fasta,'r').readlines():
                if line[:1] == '>':
                    # Header line: rename according to the pipe-delimited source format.
                    try: (name,desc) = rje.matchExp('^>(\S+) (\S.+)$',line)
                    except: name = rje.matchExp('^>(\S+)',line)[0]
                    if len(string.split(name,'|')) == 3:
                        # Six-reading-frame translation entry (no description kept).
                        name = '6rf_NEIME__%s' % string.split(name,'|')[2]
                        open(fas,'a').write('>%s\n' % name)
                    elif len(string.split(name,'|')) == 5:
                        # Reference protein entry (description kept).
                        name = 'ref_NEIME__%s' % string.split(name,'|')[3]
                        open(fas,'a').write('>%s %s\n' % (name,desc))
                    else: print string.split(name,'|'); raise ValueError    # Unexpected header format
                    self.progLog('\r#FAS','Processing %s: %s seqs' % (fas, rje.integerString(sx))); sx += 1
                else: open(fas,'a').write(line)     # Sequence line: copy unchanged
            self.printLog('\r#FAS','Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
            rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fas,protein=True,force=True)
        ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rfhits = {}     # Dictionary of {hit:['File:hit_num']}
        acc = 'MC58_6RF_Hits.acc'; open(acc,'w')    # Truncate/create the accession output file
        gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
        cx = 0
        for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
            cx += 1
            file = os.path.basename(csv)[:-4]   # CSV basename without extension
            hits = False    # Only rows after the 'prot_hit_num,prot_acc' header are hit data
            for line in open(csv,'r').readlines():
                if line.find('prot_hit_num,prot_acc') == 0: hits = True
                elif hits:
                    data = rje.readDelimit(line,',')
                    if len(data) < 2: continue
                    [num,name] = data[:2]
                    try: name = string.split(name,'|')[2]   # Accession is the third pipe-delimited field
                    except: continue
                    if name not in rfhits:
                        open(acc,'a').write('6rf_NEIME__%s\n' % name)
                        rfhits[name] = []
                    id = '%s:%s' % (file,num)
                    if id not in rfhits[name]: rfhits[name].append(id)
            self.progLog('\r#CSV','Reading %d CSV files: %s 6RF Hits' % (cx,rje.integerString(len(rfhits))))
        self.printLog('\r#CSV','Read %d CSV files: %s 6RF Hits output to %s' %
                      (cx,rje.integerString(len(rfhits)),acc))
        ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not os.path.exists(gfile):
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % acc,'fasdb=MC58_6RF.fas','seqout=MC58_6RF_Hits.fas','autoload=T','accnr=F','seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Hits.fas','searchdb=MC58_1.fas','qryacc=F']).gablam()
        ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gdata = rje.dataDict(self,gfile,['Qry'],['HitNum'])
        zeros = []      # Queries with no BLAST hit against the MC58_1 genome
        for hit in gdata:
            if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
        zeros = rje.sortUnique(zeros,False)
        open('6rf_zeros.acc','w').write(string.join(zeros,'\n'))
        self.printLog('#ZERO','%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
        ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
        if not os.path.exists(ufile):
            # Search the zero-hit queries against the bacterial EMBL database instead.
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=6rf_zeros.acc','fasdb=MC58_6RF.fas','seqout=MC58_6RF_Zeros.fas','autoload=T','accnr=F','seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Zeros.fas','searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas','qryacc=F']).gablam()
        gdata = rje.dataDict(self,ufile,['Qry'],getheaders=True)
        fdata = rje.dataDict(self,string.replace(ufile,'hitsum','gablam'),['Qry'],['Hit'],lists=True)   # For best-hit lookup
        headers = gdata.pop('Headers')
        headers.insert(1,'Sample')
        headers.append('BestHit')
        rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,rje_backup=True)
        for rf in rje.sortKeys(gdata):
            rfcut = string.split(rf,'__')[1]    # Strip the '6rf_NEIME__' prefix
            gdata[rf]['Sample'] = string.join(rfhits[rfcut],'; ')
            gdata[rf]['Qry'] = rfcut
            try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
            except: gdata[rf]['BestHit'] = '-'  # No GABLAM hit recorded
            rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,datadict=gdata[rf])
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        self.printLog('#ZEN',rje_zen.Zen().wisdom())
def run(self):  ### Main run method
    '''
    Main run method. Loads sequences and (optionally) a SLiM occurrence table, computes
    the requested per-residue plot statistics (surface accessibility, hydropathy,
    disorder, conservation) for each sequence, smooths/converts them, and writes one
    *.plot.txt output per sequence (or per sequence/dataset when occurrence data is
    present). No return value.
    '''
    try:### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.info['Basefile'].lower() in ['', 'none']: self.info['Basefile'] = ''
        elif self.info['Basefile'][-1] != '.': self.info['Basefile'] += '.'     # Ensure trailing dot for filename joins
        self.obj['SeqList'] = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=T'])
        self.list['PlotFT'] = string.split(string.join(self.list['PlotFT']).upper())    # Normalise feature types to upper case
        if self.info['OccFile'].lower() not in ['', 'none']:
            self.info['Delimit'] = rje.delimitFromExt(filename=self.info['OccFile'])
            self.dict['OccData'] = {}   # {seq name: {dataset: [occurrence dicts]}}
            occdata = rje.dataDict(self, self.info['OccFile'], ['Seq', 'Dataset', 'Pattern', 'Start_Pos', 'End_Pos'], ['Seq', 'Dataset', 'Pattern', 'Start_Pos', 'End_Pos'])
            for key in rje.sortKeys(occdata):
                # Re-nest the flat table by sequence, then dataset.
                seq = occdata[key].pop('Seq')
                if seq not in self.dict['OccData']: self.dict['OccData'][seq] = {}
                dataset = occdata[key].pop('Dataset')
                if dataset not in self.dict['OccData'][seq]: self.dict['OccData'][seq][dataset] = []
                self.dict['OccData'][seq][dataset].append(occdata[key])
            self.printLog('#OCC', 'Loaded data for %s occurrences in %s sequences' % (rje.integerString(len(occdata)), rje.integerString(len(self.dict['OccData']))))
            # Restrict the sequence list to sequences that have occurrence data.
            self.obj['SeqList'].autoFilter(['GoodSeq=%s' % string.join(rje.sortKeys(self.dict['OccData']), ',')])
        ### ~ [2] Calculate Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.list['PlotStat'] = string.split(string.join(self.list['PlotStat']).lower())    # Normalise stat names to lower case
        # SLiMCalc object only needed for conservation statistics.
        if 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']: slimcalc = rje_slimcalc.SLiMCalc(self.log, self.cmd_list)
        seqdict = self.obj['SeqList'].seqNameDic()
        for name in rje.sortKeys(seqdict):
            if self.opt['OccOnly'] and not name in self.dict['OccData']: continue   # Skip sequences without occurrences
            seq = seqdict[name]
            sequence = seq.getSequence(gaps=False)
            seq.dict['PlotStat'] = {}   # {stat name: per-residue value list}
            if 'sa' in self.list['PlotStat']: seq.dict['PlotStat']['SA'] = rje_seq.surfaceAccessibility(sequence, returnlist=True)
            if 'hyd' in self.list['PlotStat']: seq.dict['PlotStat']['Hydropathy'] = rje_seq.eisenbergHydropathy(sequence, returnlist=True)
            if 'dis' in self.list['PlotStat']: seq.dict['PlotStat']['Disorder'] = seq.disorder(returnlist=True)
            if 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']:
                slimcalc.relConListFromSeq(seq, slimcalc.stat['RelConWin'], store=True)
                try:
                    # relConListFromSeq stores its results on seq.list; move them to PlotStat.
                    seq.dict['PlotStat']['Cons_Abs'] = seq.list.pop('Cons')
                    seq.dict['PlotStat']['Cons_Rel'] = seq.list.pop('RelCons')
                except: self.printLog('#CONS', 'No conservation stats for %s' % name)
            self.printLog('#STAT', 'PlotStats calculated for %s' % name)
            for stat in seq.dict['PlotStat']:
                # Cons_Rel is already windowed by relConListFromSeq; do not smooth twice.
                if stat != 'Cons_Rel' and self.stat['PlotWin'] >= 0: seq.dict['PlotStat'][stat] = self.plotWin(seq.dict['PlotStat'][stat])
                seq.dict['PlotStat'][stat] = self.convertStat(seq.dict['PlotStat'][stat])
            self.printLog('#STAT', 'PlotStats converted for %s' % name)
            ### ~ [3] Output Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if name in self.dict['OccData']:
                # One output file per dataset for sequences with occurrence data.
                for dataset in self.dict['OccData'][name]:
                    ofile = '%s%s.%s.plot.txt' % (self.info['Basefile'], dataset, seq.info['AccNum'])
                    self.output(seq, ofile, self.dict['OccData'][name][dataset])
            else: self.output(seq, '%s%s.plot.txt' % (self.info['Basefile'], seq.info['AccNum']))
        return
    except: self.errorLog(rje_zen.Zen().wisdom())
def readHGNC(self):     ### Read links from HGNC into data structure
    '''
    Read links from HGNC into data structure.

    Parses the HGNC download (self.info['HGNCData']) into self.dict['GeneCard'],
    keyed by approved symbol (plus EnsEMBL gene ID and resolved aliases). Withdrawn
    symbols, previous symbols and synonyms are mapped onto current genes via an
    alias dictionary; aliases that map to more than one gene are treated as
    ambiguous, left unmapped and written to hgnc.ambiguities.txt. No return value.
    '''
    try:### ~ [1] Read into dictionary with HGNC ID as key ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.info['HGNCData'].lower() in ['','none']: return
        if not os.path.exists(self.info['HGNCData']): return self.log.errorLog('HGNC file "%s" not found' % (self.info['HGNCData']),printerror=False)
        hgncdata = rje.dataDict(self,self.info['HGNCData'],['HGNC ID'])
        aliaii = {}     # Dictionary of withdrawn symbols to map: {alias:gene} or {alias:'AMBIGUOUS'}
        ### ~ [2] Parse out information ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (hx,htot) = (0.0,len(hgncdata))
        for hgnc in rje.sortKeys(hgncdata):
            self.log.printLog('\r#HGNC','Processing HGNC: %.1f%%' % (hx/htot),newline=False,log=False)
            hx += 100.0
            ## ~ [2a] Adjust headers for new vs old HGNC compatibility ~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Both '... (mapped data supplied by X)' and '... (supplied by X)' headers are
            # renamed to the uniform '... (mapped data)' used below.
            data = hgncdata[hgnc]
            for hkey in rje.sortKeys(data):
                if rje.matchExp('^(\S.+\S)\s*\(mapped data supplied by \S+\)',hkey): data['%s (mapped data)' % rje.matchExp('^(\S.+\S)\s*\(mapped data supplied by \S+\)',hkey)[0]] = data.pop(hkey)
                if rje.matchExp('^(\S.+\S)\s*\(supplied by \S+\)',hkey): data['%s (mapped data)' % rje.matchExp('^(\S.+\S)\s*\(supplied by \S+\)',hkey)[0]] = data.pop(hkey)
            ## ~ [2b] Make dictionary of Genecards data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gdict = {}
            gdict['Symbol'] = gene = data['Approved Symbol'].upper()
            gdict['Desc'] = data['Approved Name']
            ## ~ [2c] Special treatment of obselete symbol ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if gene.find('~withdrawn') > 0:     ### Obselete symbol
                try:
                    gene = gene[:gene.find('~WITHDRAWN')]
                    alias = rje.matchExp(', see (\S+)',gdict['Desc'])[0]    # '..., see NEWGENE' points at the replacement
                    if len(string.split(alias)) > 1: continue   # Ambiguous
                    if gene in aliaii and aliaii[gene] != alias: aliaii[gene] = 'AMBIGUOUS'
                    else: aliaii[gene] = alias
                except: pass    # No ', see X' pointer: nothing to map
                continue
            ## ~ [2d] Add additional aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if 'Synonyms' in data and 'Aliases' not in data: data['Aliases'] = data.pop('Synonyms')     # Old vs new header name
            for alias in string.split(data['Previous Symbols'].upper(),', ') + string.split(data['Aliases'].upper(),', '):
                #x# if alias.upper() != alias: continue # Not really a symbol
                if alias in self.dict['GeneCard']: aliaii[alias] = 'AMBIGUOUS'  # Clashes with a real symbol
                if alias in aliaii and aliaii[alias] != gene: aliaii[alias] = 'AMBIGUOUS'
                else: aliaii[alias] = gene
            if gene in aliaii: aliaii[gene] = 'AMBIGUOUS'   # A current symbol cannot also be an alias
            gdict['Entrez'] = data['Entrez Gene ID']
            if not gdict['Entrez']: gdict['Entrez'] = data['Entrez Gene ID (mapped data)']
            gdict['OMIM'] = data['OMIM ID (mapped data)']
            gdict['UniProt'] = data['UniProt ID (mapped data)']
            gdict['EnsEMBL'] = ensgene = data['Ensembl ID (mapped data)']
            gdict['HGNC'] = string.replace(hgnc,'HGNC:','')
            if not gene: gene = ensgene     # Fall back on the EnsEMBL gene ID
            if not gene:
                self.log.errorLog('HGNC has no gene for %s: %s' % (gdict['HGNC'],data),printerror=False)
                continue
            self.dict['GeneCard'][gene] = gdict
            if self.opt['FullHGNC'] and gene not in self.list['Genes']: self.list['Genes'].append(gene)
            ## ~ [2e] Deal with EnsGene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if self.opt['FullEns'] and ensgene:
                if ensgene not in self.list['Genes']: self.list['Genes'].append(ensgene)
                if ensgene not in self.dict['GeneCard']: self.dict['GeneCard'][ensgene] = {}
                rje.combineDict(self.dict['GeneCard'][ensgene],gdict,overwrite=False,replaceblanks=True)
        self.log.printLog('\r#HGNC','Processed HGNC: %s genes & %s aliases' % (rje.integerString(len(self.dict['GeneCard'])),rje.integerString(len(aliaii))))
        ### ~ [3] Deal with aliaii ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ambig = []
        (hx,htot) = (0.0,len(aliaii))
        for alias in aliaii:
            self.log.printLog('\r#HGNC','Processing HGNC aliases: %.1f%%' % (hx/htot),newline=False,log=False)
            hx += 100.0
            if aliaii[alias] == 'AMBIGUOUS':
                ambig.append(alias)
                continue    # Alias mapped to multiple genes
            # Fix: guard against circular alias chains (e.g. A->B->A), which previously
            # made this while loop spin forever; a revisited symbol means the chain
            # can never reach a real GeneCard entry.
            chained = {}
            while aliaii[alias] not in self.dict['GeneCard'] and aliaii[alias] in aliaii:
                if aliaii[alias] in chained: break      # Cycle detected: cannot resolve
                chained[aliaii[alias]] = True
                aliaii[alias] = aliaii[aliaii[alias]]   # Map through several aliases if needed
            if aliaii[alias] not in self.dict['GeneCard']: continue     # Alias is not a valid Gene, so ignore
            if alias not in self.dict['GeneCard']: self.dict['GeneCard'][alias] = self.dict['GeneCard'][aliaii[alias]]
            if self.opt['FullHGNC'] and alias not in self.list['Genes']: self.list['Genes'].append(alias)
        self.log.printLog('\r#HGNC','Processed HGNC: %s genes & aliases' % (rje.integerString(len(self.dict['GeneCard']))))
        if ambig:
            self.log.printLog('#AMB','%s ambiguous aliases were not mapped' % rje.integerString(len(ambig)))
            open('hgnc.ambiguities.txt','w').write(string.join(ambig,'\n'))
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def run(self):  ### Main run method
    '''
    Main run method. Reformats *.fasta files into renamed *.fas files and BLAST-formats
    them, reads Mascot CSV exports to collect 6RF hit accessions, extracts the hit
    sequences, runs GABLAM against MC58_1 (and embl_bacteria for zero-hit queries), and
    writes the summary table MC58_6RF_Zeros.tdt. All file names are fixed conventions
    of this pipeline.
    '''
    try:### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for fasta in glob.glob('*.fasta'):
            fas = fasta[:-2]    # 'x.fasta' -> 'x.fas'
            if os.path.exists(fas): continue    # Already reformatted
            sx = 0
            for line in open(fasta, 'r').readlines():
                if line[:1] == '>':
                    # Header line: rename according to the pipe-delimited source format.
                    try: (name, desc) = rje.matchExp('^>(\S+) (\S.+)$', line)
                    except: name = rje.matchExp('^>(\S+)', line)[0]
                    if len(string.split(name, '|')) == 3:
                        # Six-reading-frame translation entry (no description kept).
                        name = '6rf_NEIME__%s' % string.split(name, '|')[2]
                        open(fas, 'a').write('>%s\n' % name)
                    elif len(string.split(name, '|')) == 5:
                        # Reference protein entry (description kept).
                        name = 'ref_NEIME__%s' % string.split(name, '|')[3]
                        open(fas, 'a').write('>%s %s\n' % (name, desc))
                    else:
                        # Unexpected header format: dump the fields and abort this run.
                        print string.split(name, '|')
                        raise ValueError
                    self.progLog('\r#FAS', 'Processing %s: %s seqs' % (fas, rje.integerString(sx)))
                    sx += 1
                else: open(fas, 'a').write(line)    # Sequence line: copy unchanged
            self.printLog('\r#FAS', 'Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
            rje_blast.BLASTRun(self.log, self.cmd_list).formatDB(fas, protein=True, force=True)
        ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rfhits = {}     # Dictionary of {hit:['File:hit_num']}
        acc = 'MC58_6RF_Hits.acc'
        open(acc, 'w')  # Truncate/create the accession output file
        gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
        cx = 0
        for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
            cx += 1
            file = os.path.basename(csv)[:-4]   # CSV basename without extension
            hits = False    # Only rows after the 'prot_hit_num,prot_acc' header are hit data
            for line in open(csv, 'r').readlines():
                if line.find('prot_hit_num,prot_acc') == 0: hits = True
                elif hits:
                    data = rje.readDelimit(line, ',')
                    if len(data) < 2: continue
                    [num, name] = data[:2]
                    try: name = string.split(name, '|')[2]  # Accession is the third pipe-delimited field
                    except: continue
                    if name not in rfhits:
                        open(acc, 'a').write('6rf_NEIME__%s\n' % name)
                        rfhits[name] = []
                    id = '%s:%s' % (file, num)
                    if id not in rfhits[name]: rfhits[name].append(id)
            self.progLog('\r#CSV', 'Reading %d CSV files: %s 6RF Hits' % (cx, rje.integerString(len(rfhits))))
        self.printLog('\r#CSV', 'Read %d CSV files: %s 6RF Hits output to %s' % (cx, rje.integerString(len(rfhits)), acc))
        ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not os.path.exists(gfile):
            seqlist = rje_seq.SeqList(
                self.log, self.cmd_list + [
                    'seqin=%s' % acc, 'fasdb=MC58_6RF.fas',
                    'seqout=MC58_6RF_Hits.fas', 'autoload=T', 'accnr=F',
                    'seqnr=F'
                ])
            seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
            seqlist.saveFasta()
            gablam.GABLAM(
                self.log, self.cmd_list + [
                    'seqin=MC58_6RF_Hits.fas', 'searchdb=MC58_1.fas',
                    'qryacc=F'
                ]).gablam()
        ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gdata = rje.dataDict(self, gfile, ['Qry'], ['HitNum'])
        zeros = []      # Queries with no BLAST hit against the MC58_1 genome
        for hit in gdata:
            if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
        zeros = rje.sortUnique(zeros, False)
        open('6rf_zeros.acc', 'w').write(string.join(zeros, '\n'))
        self.printLog('#ZERO', '%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
        ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
        if not os.path.exists(ufile):
            # Search the zero-hit queries against the bacterial EMBL database instead.
            seqlist = rje_seq.SeqList(
                self.log, self.cmd_list + [
                    'seqin=6rf_zeros.acc', 'fasdb=MC58_6RF.fas',
                    'seqout=MC58_6RF_Zeros.fas', 'autoload=T', 'accnr=F',
                    'seqnr=F'
                ])
            seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
            seqlist.saveFasta()
            gablam.GABLAM(
                self.log, self.cmd_list + [
                    'seqin=MC58_6RF_Zeros.fas',
                    'searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas',
                    'qryacc=F'
                ]).gablam()
        gdata = rje.dataDict(self, ufile, ['Qry'], getheaders=True)
        fdata = rje.dataDict(self, string.replace(ufile, 'hitsum', 'gablam'), ['Qry'], ['Hit'], lists=True)     # For best-hit lookup
        headers = gdata.pop('Headers')
        headers.insert(1, 'Sample')
        headers.append('BestHit')
        rje.delimitedFileOutput(self, 'MC58_6RF_Zeros.tdt', headers, rje_backup=True)
        for rf in rje.sortKeys(gdata):
            rfcut = string.split(rf, '__')[1]   # Strip the '6rf_NEIME__' prefix
            gdata[rf]['Sample'] = string.join(rfhits[rfcut], '; ')
            gdata[rf]['Qry'] = rfcut
            try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
            except: gdata[rf]['BestHit'] = '-'  # No GABLAM hit recorded
            rje.delimitedFileOutput(self, 'MC58_6RF_Zeros.tdt', headers, datadict=gdata[rf])
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        self.printLog('#ZEN', rje_zen.Zen().wisdom())