def __init__(self, verbose=True):
    """
    Constructor

    Opens the database session/engine pair, connects to the engine and
    stores the default drawing parameters on the instance.

    verbose -- forwarded to db_connect and kept as self.verbose
    """

    ## setup db
    session, engine = db_connect(verbose=verbose)
    self.session = session
    self.engine = engine
    self.conn = self.engine.connect()

    ## colour palette (order and repeated entries are kept exactly as given)
    palette = [
        "#000000", "#FFCC33", "#3333DD", "#002200", "y", "b",
        "orange", "#CC55FF", "#990033", "#FF6600", "#CCCCCC", "#660033",
        "#FFCC00", "#FFFFAA", "#6622AA", "#33FF77", "#998800", "#0000FF",
        "#995599", "#00AA00", "#777777", "#FF0033", "#990066",
        "#FA58AC", "#8A0808", "#D8D8D8", "#CC2277", "#336666", "#996633",
        "#FFCCCC", "#CC0011", "#FFBB33", "#DDDDDD", "#991188",
        "#FF9966", "#009999", "#FF0099", "#996633", "#990000",
        "#660000", "#9900BB", "#330033", "#FF5544", "#9966CC",
        "#330066", "#99FF99", "#FF99FF", "#333333", "#CC3333",
        "#CC9900", "#99DD22", "#3322BB", "#663399", "#002255",
        "#003333", "#66CCFF", "#CCFFFF", "#AA11BB", "#000011",
        "#FFCCFF", "#00EE33", "#337722", "#CCBBFF", "#FF3300",
        "#009999", "#110000", "#AAAAFF", "#990000", "#880022",
        "#BBBBBB", "#00EE88", "#66AA22", "#99FFEE", "#660022",
        "#FFFF33", "#00CCFF", "#990066", "#006600", "#00CCFF",
        "#AAAAAA", "#33FF00", "#0066FF", "#FF9900", "#FFCC00",
    ]

    ## global variables
    self.verbose = verbose
    self.dpi = 400
    self.labelOffset = 0.05
    self.fontSize = 8
    self.legendFontSize = 8
    self.increment = 0.03
    self.fontName = 'serif'
    self.termAlpha = 0.95
    self.geneAlpha = 0.4
    self.nodeSizeGene = 300
    self.nodeSizeTerm = 300
    self.colors = palette
    self.cmap = plt.cm.Blues
    self.lineEnd = 53
    self.linesMax = 32
def setUp(self):
    """
    connect to the database

    Opens the session/engine pair with the module-level UPASS value,
    connects to the engine and stores the fixed identifier '5476' as testID.
    """

    session, engine = db_connect(upass=UPASS)
    self.session = session
    self.engine = engine
    self.conn = self.engine.connect()
    self.testID = '5476'
def __init__(self):
    """
    Constructor

    Opens the database session/engine pair, connects to the engine and
    starts with no hits stored.
    """

    session, engine = db_connect()
    self.session = session
    self.engine = engine
    self.conn = self.engine.connect()
    self.hits = None
def __init__(self):
    """
    Constructor

    Opens the database session/engine pair, connects to the engine and
    starts with no hits stored.
    """

    session, engine = db_connect()
    self.session = session
    self.engine = engine
    self.conn = self.engine.connect()
    self.hits = None
def __init__(self,taxaInput,delimiter="\t",upass=''):
    """
    Build the list of taxa and open a database session.

    taxaInput -- a path to a tab delimited file or simply a python list.
                 In a file the first column is the ncbi taxa id, all
                 additional columns are optional and user specified
    delimiter -- the delimiter can also be specified
    upass     -- forwarded to db_connect as upass

    Raises an Exception if an entry of a list input contains a
    non-digit character.
    """

    self.taxaList = []
    self.session,self.engine = db_connect(upass=upass)

    if isinstance(taxaInput,list):
        for taxon in taxaInput:
            ## str() lets integer ids pass the digit check instead of
            ## failing inside re.search with a TypeError
            if re.search(r"\D",str(taxon)):
                raise Exception("Bad taxa in taxaList (%s)"%(taxon,))
            self.taxaList.append(taxon)
    else:
        ## NOTE(review): delimiter is not forwarded to read_file here --
        ## confirm read_file does not need it
        self.taxaList = list(set(self.read_file(taxaInput)))

    ## NOTE(review): a list input is not de-duplicated, so "unique" only
    ## strictly holds for file input
    print("There are %s unique taxa in the list"%(len(self.taxaList)))
def __init__(self, taxaList, verbose=False, upass='', idType='ncbi', useIea=True,
             aspect='biological_process'):
    """
    Constructor

    taxaList -- a list of NCBI taxa ids
    verbose  -- forwarded to db_connect
    upass    -- forwarded to db_connect
    idType   -- 'uniprot' or 'ncbi' (compared case-insensitively)
    useIea   -- stored on the instance as given
    aspect   -- stored on the instance as given

    Raises an Exception when idType is neither 'uniprot' nor 'ncbi'.
    """

    ## error checking
    idType = idType.lower()
    if idType != 'uniprot' and idType != 'ncbi':
        raise Exception("Invalid idType argument in fetch annotations use 'uniprot' or 'ncbi'")

    ## start a database session
    session, engine = db_connect(verbose=verbose, upass=upass)
    self.session = session
    self.engine = engine

    ## global variables
    self.taxaList = taxaList
    self.idType = idType
    self.useIea = useIea
    self.aspect = aspect
def __init__(self, taxaInput, delimiter="\t", upass=''):
    """
    Build the list of taxa and open a database session.

    taxaInput -- a path to a tab delimited file or simply a python list.
                 In a file the first column is the ncbi taxa id, all
                 additional columns are optional and user specified
    delimiter -- the delimiter can also be specified
    upass     -- forwarded to db_connect as upass

    Raises an Exception if an entry of a list input contains a
    non-digit character.
    """

    self.taxaList = []
    self.session, self.engine = db_connect(upass=upass)

    if isinstance(taxaInput, list):
        for taxon in taxaInput:
            ## str() lets integer ids pass the digit check instead of
            ## failing inside re.search with a TypeError
            if re.search(r"\D", str(taxon)):
                raise Exception("Bad taxa in taxaList (%s)" % (taxon,))
            self.taxaList.append(taxon)
    else:
        ## NOTE(review): delimiter is not forwarded to read_file here --
        ## confirm read_file does not need it
        self.taxaList = list(set(self.read_file(taxaInput)))

    ## NOTE(review): a list input is not de-duplicated, so "unique" only
    ## strictly holds for file input
    print("There are %s unique taxa in the list" % (len(self.taxaList)))
def __init__(self, verbose=True):
    """
    Constructor

    Opens the database session/engine pair, connects to the engine and
    stores the default drawing parameters on the instance.

    verbose -- forwarded to db_connect and kept as self.verbose
    """

    ## setup db
    session, engine = db_connect(verbose=verbose)
    self.session = session
    self.engine = engine
    self.conn = self.engine.connect()

    ## colour palette (order and repeated entries are kept exactly as given)
    palette = [
        "#000000", "#FFCC33", "#3333DD", "#002200", "y", "b",
        "orange", "#CC55FF", "#990033", "#FF6600", "#CCCCCC", "#660033",
        "#FFCC00", "#FFFFAA", "#6622AA", "#33FF77", "#998800", "#0000FF",
        "#995599", "#00AA00", "#777777", "#FF0033", "#990066",
        "#FA58AC", "#8A0808", "#D8D8D8", "#CC2277", "#336666", "#996633",
        "#FFCCCC", "#CC0011", "#FFBB33", "#DDDDDD", "#991188",
        "#FF9966", "#009999", "#FF0099", "#996633", "#990000",
        "#660000", "#9900BB", "#330033", "#FF5544", "#9966CC",
        "#330066", "#99FF99", "#FF99FF", "#333333", "#CC3333",
        "#CC9900", "#99DD22", "#3322BB", "#663399", "#002255",
        "#003333", "#66CCFF", "#CCFFFF", "#AA11BB", "#000011",
        "#FFCCFF", "#00EE33", "#337722", "#CCBBFF", "#FF3300",
        "#009999", "#110000", "#AAAAFF", "#990000", "#880022",
        "#BBBBBB", "#00EE88", "#66AA22", "#99FFEE", "#660022",
        "#FFFF33", "#00CCFF", "#990066", "#006600", "#00CCFF",
        "#AAAAAA", "#33FF00", "#0066FF", "#FF9900", "#FFCC00",
    ]

    ## global variables
    self.verbose = verbose
    self.dpi = 400
    self.labelOffset = 0.05
    self.fontSize = 8
    self.legendFontSize = 8
    self.increment = 0.03
    self.fontName = 'serif'
    self.termAlpha = 0.95
    self.geneAlpha = 0.4
    self.nodeSizeGene = 300
    self.nodeSizeTerm = 300
    self.colors = palette
    self.cmap = plt.cm.Blues
    self.lineEnd = 53
    self.linesMax = 32
def __init__(self, taxaList, verbose=False, upass='', idType='ncbi', useIea=True,
             aspect='biological_process'):
    """
    Constructor

    taxaList -- a list of NCBI taxa ids
    verbose  -- forwarded to db_connect
    upass    -- forwarded to db_connect
    idType   -- 'uniprot' or 'ncbi' (compared case-insensitively)
    useIea   -- stored on the instance as given
    aspect   -- stored on the instance as given

    Raises an Exception when idType is neither 'uniprot' nor 'ncbi'.
    """

    ## error checking
    idType = idType.lower()
    if idType != 'uniprot' and idType != 'ncbi':
        raise Exception("Invalid idType argument in fetch annotations use 'uniprot' or 'ncbi'")

    ## start a database session
    session, engine = db_connect(verbose=verbose, upass=upass)
    self.session = session
    self.engine = engine

    ## global variables
    self.taxaList = taxaList
    self.idType = idType
    self.useIea = useIea
    self.aspect = aspect
def enrichment_hypergeo(termList, entityList, species, useIea=True, asGenes=True, aspect="biological_process", verbose=True):
    """
    Test each term for enrichment in an entity list with the
    hypergeometric distribution.

    termList   -- are the terms to be tested
    entityList -- gene or uniprot ids
    species    -- an ncbi taxa id
    useIea     -- forwarded to fetch_taxa_annotations
    asGenes    -- if True the gene annotations are used, otherwise the
                  uniprot annotations
    aspect     -- forwarded to fetch_taxa_annotations
    verbose    -- forwarded to fetch_taxa_annotations

    What is the probability of finding a given number of terms if we
    randomly select N out of M objects?

    M -- genes with at least one annotation
    N -- number of draws or size of gene list
    k -- the number of genes annotated by a given term (total type I objects)
    x -- number of times we observe a term in the gene list (draws)

    in R the cdf can be obtained with
    phyper(x,k,M-k,N)

    hypergeom.pmf(x, M, k, N)

    Returns a dict where term id is the key and hypergeo pvalue is the value.
    The value is np.nan when any of x, M, N, k is zero, which includes
    terms that annotate no entity at all.
    """

    ## connect to db and get annotations for the species
    session, engine = db_connect()
    geneAnnots, uniprotAnnots = fetch_taxa_annotations([species], engine, useIea=useIea, verbose=verbose, aspect=aspect)

    if asGenes == True:
        entity2go = geneAnnots
    else:
        entity2go = uniprotAnnots

    ## invert the mapping: term -> set of entities annotated by it
    go2entity = {}
    for entity, terms in entity2go.items():
        for term in terms:
            go2entity.setdefault(term, set()).add(entity)

    print ("total go terms - %s" % (len(go2entity.keys())))
    print ("total entities - %s" % (len(entity2go.keys())))

    ## set variables
    M = len(entity2go)
    N = len(entityList)
    results = {}

    for testTerm in termList:
        ## a term missing from the annotations has k = 0 (nan below)
        ## instead of raising a KeyError
        k = len(go2entity.get(testTerm, ()))
        x = sum(1 for entity in entityList
                if entity in entity2go and testTerm in entity2go[entity])

        ## get a p-value
        if 0 in [x, M, N, k]:
            pvalue = np.nan
        else:
            ## NOTE(review): the doubled tail can exceed 1 when cdf < 0.5;
            ## formula kept as is -- confirm it is the intended statistic
            cdf = hypergeom.cdf(x, M, k, N)
            if cdf > 0:
                pvalue = 2 * (1 - cdf)
            else:
                pvalue = 2 * cdf

        results[testTerm] = pvalue

    return results
#!/usr/bin/env python """ probe the taxa in the list for annotation coverage and summary information """ import sys, getopt, os from htsint import TaxaSummary from htsint.database import db_connect self.session, self.engine = db_connect() ## get how many genes code for proteins codingQuery = self.session.query(Uniprot).filter_by(taxa_id=taxaQuery.id).all() codingGenes = list(set([u.gene_id for u in uniprotQuery])) remove_empty(codingGenes) ## get number of genes/proteins with at least one annotation annotatedGenes = list(set([a.gene_id for a in annotations])) annotatedProts = list(set([a.uniprot_id for a in annotations])) remove_empty(annotatedGenes) remove_empty(annotatedProts) apQuery = [ self.session.query(Uniprot).filter_by(id=uid).first() for uid in annotatedProts ] #apQuery = self.session.query(Uniprot).filter(Uniprot.id.in_(annotatedProts)).all()
def enrichment_hypergeo(termList, entityList, species, useIea=True, asGenes=True, aspect='biological_process', verbose=True):
    '''
    Test each term for enrichment in an entity list with the
    hypergeometric distribution.

    termList   -- are the terms to be tested
    entityList -- gene or uniprot ids
    species    -- an ncbi taxa id
    useIea     -- forwarded to fetch_taxa_annotations
    asGenes    -- if True the gene annotations are used, otherwise the
                  uniprot annotations
    aspect     -- forwarded to fetch_taxa_annotations
    verbose    -- forwarded to fetch_taxa_annotations

    What is the probability of finding a given number of terms if we
    randomly select N out of M objects?

    M -- genes with at least one annotation
    N -- number of draws or size of gene list
    k -- the number of genes annotated by a given term (total type I objects)
    x -- number of times we observe a term in the gene list (draws)

    in R the cdf can be obtained with
    phyper(x,k,M-k,N)

    hypergeom.pmf(x, M, k, N)

    Returns a dict where term id is the key and hypergeo pvalue is the value.
    The value is np.nan when any of x, M, N, k is zero, which includes
    terms that annotate no entity at all.
    '''

    ## connect to db and get annotations for the species
    session, engine = db_connect()
    geneAnnots, uniprotAnnots = fetch_taxa_annotations([species], engine, useIea=useIea, verbose=verbose, aspect=aspect)

    if asGenes == True:
        entity2go = geneAnnots
    else:
        entity2go = uniprotAnnots

    ## invert the mapping: term -> set of entities annotated by it
    go2entity = {}
    for entity, terms in entity2go.items():
        for term in terms:
            go2entity.setdefault(term, set()).add(entity)

    print('total go terms - %s' % (len(go2entity.keys())))
    print('total entities - %s' % (len(entity2go.keys())))

    ## set variables
    M = len(entity2go)
    N = len(entityList)
    results = {}

    for testTerm in termList:
        ## a term missing from the annotations has k = 0 (nan below)
        ## instead of raising a KeyError
        k = len(go2entity.get(testTerm, ()))
        x = sum(1 for entity in entityList
                if entity in entity2go and testTerm in entity2go[entity])

        ## get a p-value
        if 0 in [x, M, N, k]:
            pvalue = np.nan
        else:
            ## NOTE(review): the doubled tail can exceed 1 when cdf < 0.5;
            ## formula kept as is -- confirm it is the intended statistic
            cdf = hypergeom.cdf(x, M, k, N)
            if cdf > 0:
                pvalue = 2 * (1 - cdf)
            else:
                pvalue = 2 * cdf

        results[testTerm] = pvalue

    return results
def get_blast_map(resultsFilePath, evalue=0.00001, taxaList=None, asGenes=False, append=False):
    """
    load assembly blast results into dictionary

    resultsFilePath -- path to the csv results file, the first row is a header.
                       Rows with 4 columns are read as
                       query, hit, hit ncbi id, e-value; any other row as
                       query, query ncbi id, hit, hit ncbi id, e-value
    evalue          -- rows with an e-value above this threshold are skipped
    taxaList        -- if provided then only genes from given taxa will be
                       included in map (requires the database)
    asGenes         -- if True the "_i<digits>" part of the query id is removed
                       so the results are provided with keys to genes not isoforms
    append          -- if False every query maps to its best (hit ncbi id, e-value)
                       tuple; if True every query maps to a list that starts with
                       the first accepted hit and gains a tuple each time a
                       lower e-value is seen

    Rows that contain a '-' field are counted but skipped.

    Returns the dictionary keyed by query id.
    Raises an Exception if resultsFilePath does not exist.
    """

    if not os.path.exists(resultsFilePath):
        raise Exception("cannot find results file path %s" % resultsFilePath)

    ## gene ncbi id -> taxa id for the selected taxa (only for the taxa filter)
    gene2taxa = {}
    if taxaList:
        ## prepare database connections
        session, engine = db_connect()
        conn = engine.connect()
        s = select([Taxon.id, Taxon.ncbi_id, Taxon.name]).where(Taxon.ncbi_id.in_(taxaList))
        taxaQueries = conn.execute(s).fetchall()

        ## NOTE(review): restores the gene2taxa lookup that was commented out
        ## although the filter below depends on it; relies on Gene being
        ## imported next to Taxon in this module -- confirm
        for tquery in taxaQueries:
            s = select([Gene.taxa_id, Gene.ncbi_id], Gene.taxa_id == tquery['id'])
            geneQueries = conn.execute(s).fetchall()
            gene2taxa.update(dict([(str(r['ncbi_id']), str(r['taxa_id'])) for r in geneQueries]))

    results = {}
    bestEvalue = {}
    uniqueQueries = set()
    totalQueries = 0
    unfilteredQueries = 0

    with open(resultsFilePath, 'r') as fid:
        reader = csv.reader(fid)
        header = next(reader, None)
        if header is not None:
            print(header)

        ## loop through file and save best
        for linja in reader:
            if not linja:
                continue

            # filtering ('-' is checked before the float conversion so a
            # missing e-value cannot crash the parse)
            totalQueries += 1
            if '-' in linja:
                continue

            queryId = linja[0]
            if len(linja) == 4:
                hitNcbiId = linja[2]
                _evalue = float(linja[3])
            else:
                hitNcbiId = linja[3]
                _evalue = float(linja[4])

            if asGenes == True:
                queryId = re.sub(r"_i\d+", "", queryId)

            if _evalue > evalue:
                continue
            if taxaList and str(hitNcbiId) not in gene2taxa:
                continue

            unfilteredQueries += 1
            uniqueQueries.add(queryId)

            ## use the best evalue
            if queryId not in results:
                if append:
                    results[queryId] = [(hitNcbiId, _evalue)]
                else:
                    results[queryId] = (hitNcbiId, _evalue)
                bestEvalue[queryId] = _evalue
            elif _evalue < bestEvalue[queryId]:
                if append:
                    results[queryId].append((hitNcbiId, _evalue))
                else:
                    results[queryId] = (hitNcbiId, _evalue)
                bestEvalue[queryId] = _evalue

    print("total queries: %s" % totalQueries)
    print("unfiltered queries: %s" % unfilteredQueries)
    print("unique: %s" % len(uniqueQueries))

    return results
def create_blast_map(refTaxon, taxaList, resultsFilePath, evalue=0.00001, verbose=False):
    """
    read a summarized reference blast results file and create a map
    results are gene centric

    example results file looks like this

    query(refseq),query(geneId),hit(uniprotEntry),hit(geneEntry),e-value
    NP_001016845.1,549599,AQP3_HUMAN,360,1.24637e-170
    NP_001016845.1,549599,AQP9_HUMAN,366,7.57313e-92
    NP_001016845.1,549599,AQP10_HUMAN,89872,2.87154e-85
    NP_001016845.1,549599,AQP7_HUMAN,364,8.01308e-84
    NP_001016845.1,549599,AQP5_HUMAN,362,4.267e-15

    refTaxon        -- ncbi taxa id of the query species, must be in taxaList
    taxaList        -- ncbi taxa ids used to select the genes of interest
    resultsFilePath -- path to the csv results file (first row is a header)
    evalue          -- rows with an e-value above this threshold are skipped
    verbose         -- currently unused

    Returns two dictionaries (mapper1, mapper2):
    mapper1 maps a hit gene id to the list of query gene ids and
    mapper2 maps a query gene id to the list of hit gene ids, using the
    best (lowest e-value) hit per query gene and hit taxon.

    Raises an Exception if the results file is missing, if refTaxon is not
    in taxaList or if a query gene does not belong to refTaxon.
    """

    ## error check
    if not os.path.exists(resultsFilePath):
        raise Exception("cannot find results file path %s" % resultsFilePath)
    if refTaxon not in taxaList:
        raise Exception("refTaxon must be in taxaList")

    ## prepare database connections
    session, engine = db_connect()
    conn = engine.connect()

    ## fetch the selected taxa; taxa2name maps database taxon id -> ncbi id
    s = select([Taxon.id, Taxon.ncbi_id, Taxon.name]).where(Taxon.ncbi_id.in_(taxaList))
    taxaQueries = conn.execute(s).fetchall()
    taxa2name = dict([(str(tquery['id']), str(tquery['ncbi_id'])) for tquery in taxaQueries])

    ## create a gene2taxa dictionary
    gene2taxa = {}
    for tquery in taxaQueries:
        s = select([Gene.taxa_id, Gene.ncbi_id], Gene.taxa_id == tquery['id'])
        _geneQueries = conn.execute(s)
        gene2taxa.update(dict([(str(r['ncbi_id']), str(r['taxa_id'])) for r in _geneQueries.fetchall()]))

    totalQueries = set()
    filteredQueries = set()
    filteredHits = set()

    ## creates a dictionary results['taxaId']['geneId'] = (bestHit, evalue)
    results = {}
    with open(resultsFilePath, 'r') as fid:
        reader = csv.reader(fid)
        next(reader, None)  # skip the header row

        ## loop through file and save best
        for linja in reader:
            if not linja:
                continue
            queryGene, hitGene = linja[1], linja[3]
            _evalue = float(linja[4])
            totalQueries.add(queryGene)

            ## filter by species and evalue
            if hitGene not in gene2taxa or _evalue > evalue:
                continue
            if queryGene == '-' or hitGene == '-':
                continue

            ## filter self matches
            ## NOTE(review): taxa2name values are strings, so refTaxon has to
            ## be passed as a string for these comparisons to match
            taxId = taxa2name[gene2taxa[hitGene]]
            if taxId == refTaxon:
                continue

            queryTaxon = taxa2name[gene2taxa[queryGene]]
            if queryTaxon != refTaxon:
                raise Exception("Invalid query or invalid refTaxon %s != %s" % (queryTaxon, refTaxon))

            filteredQueries.add(queryGene)
            filteredHits.add(hitGene)

            ## use the best evalue
            taxonHits = results.setdefault(taxId, {})
            if queryGene not in taxonHits or _evalue < taxonHits[queryGene][1]:
                taxonHits[queryGene] = (hitGene, _evalue)

    ## returns a simplified form of the results as two mappers
    mapper1, mapper2 = {}, {}
    for taxonHits in results.values():
        for queryGene, hit in taxonHits.items():
            mapper1.setdefault(hit[0], []).append(queryGene)
            mapper2.setdefault(queryGene, []).append(hit[0])

    print('BLAST: total queries: %s' % (len(totalQueries)))
    print('BLAST: filtered queries (evalue=%s)(taxa=%s): %s' % (evalue, str(taxaList), len(filteredQueries)))
    print('BLAST: filtered hits: %s' % (len(filteredHits)))

    return mapper1, mapper2
import time,sys
from htsint.database import db_connect,Gene,GoAnnotation,GoTerm
from htsint.database import fetch_annotations,gene_mapper

## times how long it takes to pull the GO annotations for a list of genes

## variables
session,engine = db_connect()
geneList = ['30970']#,'30971','30972','30973','30975']
expEvidCodes = ["EXP","IDA","IPI","IMP","IGI","IEP"]
compEvidCodes = ["ISS","ISO","ISA","ISM","IGC","RCA"]
statEvidCodes = ["TAS","NAS","IC"]
nonCuratedEvidCodes = ["IEA"]
acceptedCodes = expEvidCodes + statEvidCodes
annotations = {}
aspect = 'biological_process'

## time the gene lookup
timeStart = time.time()
geneQueries = session.query(Gene).filter(Gene.ncbi_id.in_(geneList)).all()
print("...extraction time 1: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))

#timeStart = time.time()
#geneIdMap = gene_mapper(session,ncbiIdList=geneList)
#print("...extraction time 1: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))

## get all results
timeStart = time.time()
for geneQuery in geneQueries:
    annotations[geneQuery.ncbi_id] = set([])
    ## print() call instead of the python 2 print statement, like the rest of the script
    print(geneQuery.ncbi_id)
    annotations[geneQuery.ncbi_id].update(session.query(GoAnnotation).filter_by(gene_id=geneQuery.id).all())

## elapsed time for the annotation queries of all genes
print("...extraction q1: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))
def create_blast_map(refTaxon,taxaList,resultsFilePath,evalue=0.00001,verbose=False):
    """
    read a summarized reference blast results file and create a map
    results are gene centric

    example results file looks like this

    query(refseq),query(geneId),hit(uniprotEntry),hit(geneEntry),e-value
    NP_001016845.1,549599,AQP3_HUMAN,360,1.24637e-170
    NP_001016845.1,549599,AQP9_HUMAN,366,7.57313e-92
    NP_001016845.1,549599,AQP10_HUMAN,89872,2.87154e-85
    NP_001016845.1,549599,AQP7_HUMAN,364,8.01308e-84
    NP_001016845.1,549599,AQP5_HUMAN,362,4.267e-15

    refTaxon        -- ncbi taxa id of the query species, must be in taxaList
    taxaList        -- ncbi taxa ids used to select the genes of interest
    resultsFilePath -- path to the csv results file (first row is a header)
    evalue          -- rows with an e-value above this threshold are skipped
    verbose         -- currently unused

    Returns two dictionaries (mapper1, mapper2):
    mapper1 maps a hit gene id to the list of query gene ids and
    mapper2 maps a query gene id to the list of hit gene ids, using the
    best (lowest e-value) hit per query gene and hit taxon.

    Raises an Exception if the results file is missing, if refTaxon is not
    in taxaList or if a query gene does not belong to refTaxon.
    """

    ## error check
    if not os.path.exists(resultsFilePath):
        raise Exception("cannot find results file path %s"%resultsFilePath)
    if refTaxon not in taxaList:
        raise Exception("refTaxon must be in taxaList")

    ## prepare database connections
    session,engine = db_connect()
    conn = engine.connect()

    ## fetch the selected taxa; taxa2name maps database taxon id -> ncbi id
    s = select([Taxon.id,Taxon.ncbi_id,Taxon.name]).where(Taxon.ncbi_id.in_(taxaList))
    taxaQueries = conn.execute(s).fetchall()
    taxa2name = dict([(str(tquery['id']),str(tquery['ncbi_id'])) for tquery in taxaQueries])

    ## create a gene2taxa dictionary
    gene2taxa = {}
    for tquery in taxaQueries:
        s = select([Gene.taxa_id,Gene.ncbi_id],Gene.taxa_id==tquery['id'])
        _geneQueries = conn.execute(s)
        gene2taxa.update(dict([(str(r['ncbi_id']),str(r['taxa_id'])) for r in _geneQueries.fetchall()]))

    totalQueries = set()
    filteredQueries = set()
    filteredHits = set()

    ## creates a dictionary results['taxaId']['geneId'] = (bestHit, evalue)
    results = {}
    with open(resultsFilePath,'r') as fid:
        reader = csv.reader(fid)
        next(reader,None)  # skip the header row

        ## loop through file and save best
        for linja in reader:
            if not linja:
                continue
            queryGene,hitGene = linja[1],linja[3]
            _evalue = float(linja[4])
            totalQueries.add(queryGene)

            ## filter by species and evalue
            if hitGene not in gene2taxa or _evalue > evalue:
                continue
            if queryGene == '-' or hitGene == '-':
                continue

            ## filter self matches
            ## NOTE(review): taxa2name values are strings, so refTaxon has to
            ## be passed as a string for these comparisons to match
            taxId = taxa2name[gene2taxa[hitGene]]
            if taxId == refTaxon:
                continue

            queryTaxon = taxa2name[gene2taxa[queryGene]]
            if queryTaxon != refTaxon:
                raise Exception("Invalid query or invalid refTaxon %s != %s"%(queryTaxon,refTaxon))

            filteredQueries.add(queryGene)
            filteredHits.add(hitGene)

            ## use the best evalue
            taxonHits = results.setdefault(taxId,{})
            if queryGene not in taxonHits or _evalue < taxonHits[queryGene][1]:
                taxonHits[queryGene] = (hitGene,_evalue)

    ## returns a simplified form of the results as two mappers
    mapper1,mapper2 = {},{}
    for taxonHits in results.values():
        for queryGene,hit in taxonHits.items():
            mapper1.setdefault(hit[0],[]).append(queryGene)
            mapper2.setdefault(queryGene,[]).append(hit[0])

    print('BLAST: total queries: %s'%(len(totalQueries)))
    print('BLAST: filtered queries (evalue=%s)(taxa=%s): %s'%(evalue,str(taxaList),len(filteredQueries)))
    print('BLAST: filtered hits: %s'%(len(filteredHits)))

    return mapper1,mapper2
#!/usr/bin/python

import time
from sqlalchemy.sql import select
from htsint.database import db_connect, Taxon, Gene, Uniprot, Refseq
from htsint.database import uniprot_mapper

session, engine = db_connect()
conn = engine.connect()

## uniprot entry names used as query input (repeated entries are kept as given)
uniprotEntries = [
    "KCNQ4_MOUSE", "CSMT1_XENTR", "CSMT1_MOUSE", "MILK2_MOUSE", "MILK2_RAT",
    "MILK1_RAT", "MILK1_MOUSE", "MICA3_MOUSE", "MCA3A_DANRE", "MCA3B_DANRE",
    "MICA1_DANRE", "MCA2B_DANRE", "MICLK_MOUSE", "MICLK_RAT", "MILK2_RAT",
    "MILK2_MOUSE", "MILK1_RAT", "MILK1_MOUSE", "MICA3_MOUSE", "MICA2_RAT",
    "MCA3A_DANRE", "MICA2_MOUSE", "EHBP1_MOUSE", "EH1L1_MOUSE", "SPTB2_MOUSE",
    "MCA2B_DANRE", "SPTN2_RAT", "MICA2_XENTR", "SPTCB_DROME", "ACTN_DROME",
    "SPTB1_MOUSE", "ACTN2_MOUSE", "ACTN3_MOUSE", "ACTN2_CHICK", "ACTN1_RAT",
    "ACTN1_CHICK", "ACTN1_MOUSE", "CYTSA_CHICK", "MCA3B_DANRE", "CYTSA_CANFA",
    "CYTSA_DANRE", "CYTSA_MOUSE", "AIN1_SCHPO", "MICA1_DANRE", "CYTSA_RAT",
    "SYNE2_MOUSE", "ACTN4_CHICK", "ACTN4_MOUSE", "ACTN4_RAT", "CYTSA_XENTR",
    "CYTSB_MOUSE", "SMTL2_MOUSE", "SMTN_MOUSE", "DYST_MOUSE", "PLEC_RAT",
    "PLEC_MOUSE", "DMD_CHICK", "DMD_CANFA", "DMD_MOUSE", "MICA1_RAT",
    "SMTL1_MOUSE", "MICA1_MOUSE", "MACF1_MOUSE", "MACF1_RAT", "DMD_CAEEL",
    ## the commas after "ACTN4_CHICK" and "ACTN1_RAT" were missing, which
    ## silently merged each with the entry that follows it
    "MILK2_MOUSE", "MILK2_RAT", "MILK1_RAT", "MILK1_MOUSE", "ACTN4_CHICK",
    "ACTN4_RAT", "ACTN4_MOUSE", "ACTN1_CHICK", "ACTN_DROME", "ACTN1_RAT",
    "ACTN1_MOUSE", "ACTN3_MOUSE", "SPTCB_DROME", "ACTN2_MOUSE", "ACTN2_CHICK",
]

## using select method
timeStart = time.time()
def get_blast_map(resultsFilePath,evalue=0.00001,taxaList=None,asGenes=False,append=False):
    """
    load assembly blast results into dictionary

    resultsFilePath -- path to the csv results file, the first row is a header.
                       Rows with 4 columns are read as
                       query, hit, hit ncbi id, e-value; any other row as
                       query, query ncbi id, hit, hit ncbi id, e-value
    evalue          -- rows with an e-value above this threshold are skipped
    taxaList        -- if provided then only genes from given taxa will be
                       included in map (requires the database)
    asGenes         -- if True the "_i<digits>" part of the query id is removed
                       so the results are provided with keys to genes not isoforms
    append          -- if False every query maps to its best (hit ncbi id, e-value)
                       tuple; if True every query maps to a list that starts with
                       the first accepted hit and gains a tuple each time a
                       lower e-value is seen

    Rows that contain a '-' field are counted but skipped.

    Returns the dictionary keyed by query id.
    Raises an Exception if resultsFilePath does not exist.
    """

    if not os.path.exists(resultsFilePath):
        raise Exception("cannot find results file path %s"%resultsFilePath)

    ## gene ncbi id -> taxa id for the selected taxa (only for the taxa filter)
    gene2taxa = {}
    if taxaList:
        ## prepare database connections
        session,engine = db_connect()
        conn = engine.connect()
        s = select([Taxon.id,Taxon.ncbi_id,Taxon.name]).where(Taxon.ncbi_id.in_(taxaList))
        taxaQueries = conn.execute(s).fetchall()

        ## NOTE(review): restores the gene2taxa lookup that was commented out
        ## although the filter below depends on it; relies on Gene being
        ## imported next to Taxon in this module -- confirm
        for tquery in taxaQueries:
            s = select([Gene.taxa_id,Gene.ncbi_id],Gene.taxa_id==tquery['id'])
            geneQueries = conn.execute(s).fetchall()
            gene2taxa.update(dict([(str(r['ncbi_id']),str(r['taxa_id'])) for r in geneQueries]))

    results = {}
    bestEvalue = {}
    uniqueQueries = set()
    totalQueries = 0
    unfilteredQueries = 0

    with open(resultsFilePath,'r') as fid:
        reader = csv.reader(fid)
        header = next(reader,None)
        if header is not None:
            print(header)

        ## loop through file and save best
        for linja in reader:
            if not linja:
                continue

            # filtering ('-' is checked before the float conversion so a
            # missing e-value cannot crash the parse)
            totalQueries += 1
            if '-' in linja:
                continue

            queryId = linja[0]
            if len(linja) == 4:
                hitNcbiId = linja[2]
                _evalue = float(linja[3])
            else:
                hitNcbiId = linja[3]
                _evalue = float(linja[4])

            if asGenes == True:
                queryId = re.sub(r"_i\d+","",queryId)

            if _evalue > evalue:
                continue
            if taxaList and str(hitNcbiId) not in gene2taxa:
                continue

            unfilteredQueries += 1
            uniqueQueries.add(queryId)

            ## use the best evalue
            if queryId not in results:
                if append:
                    results[queryId] = [(hitNcbiId,_evalue)]
                else:
                    results[queryId] = (hitNcbiId,_evalue)
                bestEvalue[queryId] = _evalue
            elif _evalue < bestEvalue[queryId]:
                if append:
                    results[queryId].append((hitNcbiId,_evalue))
                else:
                    results[queryId] = (hitNcbiId,_evalue)
                bestEvalue[queryId] = _evalue

    print("total queries: %s"%totalQueries)
    print("unfiltered queries: %s"%unfilteredQueries)
    print("unique: %s"%len(uniqueQueries))

    return results