Exemplo n.º 1
0
 def testFetchTaxaAnnotations(self):
     """
     test the GoTerm and GoAnnotation tables

     Calls fetch_taxa_annotations for self.testID with useIea=False and
     verbose=True, then checks that the UniProt entry 'Q9Y765' carries
     the term 'GO:0018343'.
     """

     print("fetching annotations for taxa")
     _geneAnnots, uniprotAnnots = fetch_taxa_annotations([self.testID], self.engine,
                                                         useIea=False, verbose=True)
     ## assertIn reports the missing term and the container on failure,
     ## assertTrue(x in y) would only say "False is not true"
     self.assertIn('GO:0018343', uniprotAnnots['Q9Y765'])
Exemplo n.º 2
0
    def summarize(self,refTaxon,termsPath):
        """
        GO object summary and sanity check

        Prints the number of genes stored for every taxon in self.taxaList,
        how many annotated genes have no matching Gene row, and how the
        annotations of the reference taxon alone compare with the combined
        annotations loaded from termsPath.

        refTaxon  -- NCBI id of the reference taxon (converted to str), it
                     must be listed in self.taxaList
        termsPath -- path handed to self.load_dicts to get gene2go and go2gene

        Raises an Exception when refTaxon is not in self.taxaList.
        Nothing is returned, the summary is only printed.
        """

        refTaxon = str(refTaxon)
        if refTaxon not in self.taxaList:
            raise Exception("refTaxon not present in taxaList")

        gene2id = {}
        ## stays empty if the database has no Taxon row for refTaxon
        refGenes = {}
        conn = self.engine.connect()
        try:
            gene2go,go2gene = self.load_dicts(termsPath=termsPath)

            s = select([Taxon.id,Taxon.ncbi_id,Taxon.name]).where(Taxon.ncbi_id.in_(self.taxaList))
            taxaQueries = conn.execute(s).fetchall()

            for tquery in taxaQueries:
                s = select([Gene.id,Gene.ncbi_id],Gene.taxa_id==tquery['id'])
                geneRows = conn.execute(s).fetchall()
                taxaDict = {str(r['ncbi_id']): str(r['id']) for r in geneRows}
                if str(tquery['ncbi_id']) == refTaxon:
                    refGenes = taxaDict.copy()

                print("there are  %s genes from %s (%s)"%(len(taxaDict),tquery['name'],tquery['ncbi_id']))
                gene2id.update(taxaDict)
        finally:
            ## always release the connection, even when a query fails
            conn.close()

        ## check for unmatched genes -- compare the gene ids (the dict keys);
        ## iterating over items() would test (gene, terms) tuples, which are
        ## never keys of gene2id, and report every gene as unmatched
        unmatched = sum(1 for gene in gene2go if gene not in gene2id)

        print("Summary")
        print("IEA annotations: %s"%self.useIea)
        print("total genes in combined taxa: %s"%(len(gene2id)))
        if unmatched > 0:
            print("WARNING: there were unmatched genes unmatched: %s"%unmatched)
        print("total genes with at least one annotation: %s"%(len(gene2go)))
        print("total unique annotations: %s"%(len(go2gene)))
        print("---------------------")

        ## annotations of the reference taxon on its own
        _gene2go,_prot2go = fetch_taxa_annotations([refTaxon],self.engine,aspect=self.aspect,
                                                   useIea=self.useIea)

        total = sum(len(terms) for terms in _gene2go.values())
        print('RefTaxa genes: %s'%(len(refGenes)))
        print('Only RefTaxa: %s annotated genes, %s total annotations'%(len(_gene2go), total))

        total = sum(len(terms) for terms in gene2go.values())
        print('With additional taxa: %s annotated genes, %s total annotations'%(len(gene2go), total))

        ## report nan instead of dividing by zero when no reference genes were found
        if refGenes:
            fraction = float(len(gene2go)) / float(len(refGenes))
        else:
            fraction = float('nan')
        print('Percent annotation: %s'%fraction)
Exemplo n.º 3
0
    def testFetchTaxaAnnotations(self):
        """
        test the GoTerm and GoAnnotation tables

        Calls fetch_taxa_annotations for self.testID with useIea=False and
        verbose=True, then checks that the UniProt entry 'Q9Y765' carries
        the term 'GO:0018343'.
        """

        print("fetching annotations for taxa")
        _geneAnnots, uniprotAnnots = fetch_taxa_annotations([self.testID],
                                                            self.engine,
                                                            useIea=False,
                                                            verbose=True)
        ## assertIn reports the missing term and the container on failure,
        ## assertTrue(x in y) would only say "False is not true"
        self.assertIn('GO:0018343', uniprotAnnots['Q9Y765'])
Exemplo n.º 4
0
    def create_dicts(self, termsPath, accepted=None):
        """
        get the go2gene and gene2go dictionaries

        The annotations for self.taxaList are fetched, inverted into a
        term -> genes mapping and the list [gene2go, go2gene] is pickled
        to termsPath.

        termsPath  -- path of the file that receives the pickled dictionaries
        'accepted' - list of genes that restrict included terms to a particular list
                     (an empty list or None keeps every annotated gene)

        Raises an Exception when self.aspect is not 'biological_process',
        'molecular_function' or 'cellular_component'.
        """

        ## error checking
        if self.aspect not in [
                'biological_process', 'molecular_function',
                'cellular_component'
        ]:
            raise Exception("Invalid aspect specified%s" % self.aspect)

        ## gene2go
        print(
            "...creating gene2go dictionary -- this may take several minutes or longer depending on the number of genes"
        )
        _gene2go, _prot2go = fetch_taxa_annotations(self.taxaList,
                                                    self.engine,
                                                    aspect=self.aspect,
                                                    useIea=self.useIea)

        print(
            "...creating go2gene dictionary -- this may take several minutes")
        ## build the lookup set once so the per-gene filter is O(1)
        acceptedGenes = set(accepted) if accepted else None
        termGenes = {}
        gene2go = {}
        for gene, terms in _gene2go.items():
            if acceptedGenes is not None and gene not in acceptedGenes:
                continue

            gene2go[gene] = terms
            for term in terms:
                termGenes.setdefault(term, set()).add(gene)

        ## the pickled mapping stores plain lists rather than sets
        go2gene = dict((term, list(genes)) for term, genes in termGenes.items())

        ## pickle the dictionaries; binary mode is what pickle expects and
        ## the with-block closes the file even if dump fails
        with open(termsPath, 'wb') as handle:
            cPickle.dump([gene2go, go2gene], handle)
Exemplo n.º 5
0
    def create_dicts(self,termsPath,accepted=None):
        """
        get the go2gene and gene2go dictionaries

        The annotations for self.taxaList are fetched, inverted into a
        term -> genes mapping and the list [gene2go, go2gene] is pickled
        to termsPath.

        termsPath  -- path of the file that receives the pickled dictionaries
        'accepted' - list of genes that restrict included terms to a particular list
                     (an empty list or None keeps every annotated gene)

        Raises an Exception when self.aspect is not 'biological_process',
        'molecular_function' or 'cellular_component'.
        """

        ## error checking
        if self.aspect not in ['biological_process','molecular_function','cellular_component']:
            raise Exception("Invalid aspect specified%s"%self.aspect)

        ## gene2go
        print("...creating gene2go dictionary -- this may take several minutes or longer depending on the number of genes")
        _gene2go,_prot2go = fetch_taxa_annotations(self.taxaList,self.engine,aspect=self.aspect,
                                                   useIea=self.useIea)

        print("...creating go2gene dictionary -- this may take several minutes")
        ## build the lookup set once so the per-gene filter is O(1)
        acceptedGenes = set(accepted) if accepted else None
        termGenes = {}
        gene2go = {}
        for gene,terms in _gene2go.items():
            if acceptedGenes is not None and gene not in acceptedGenes:
                continue

            gene2go[gene] = terms
            for term in terms:
                termGenes.setdefault(term,set()).add(gene)

        ## the pickled mapping stores plain lists rather than sets
        go2gene = {term: list(genes) for term,genes in termGenes.items()}

        ## pickle the dictionaries; the with-block closes the file even if dump fails
        with open(termsPath,'wb') as handle:
            pickle.dump([gene2go,go2gene],handle)
Exemplo n.º 6
0
    def summarize(self, refTaxon, termsPath):
        """
        GO object summary and sanity check

        Prints the number of genes stored for every taxon in self.taxaList,
        how many annotated genes have no matching Gene row, and how the
        annotations of the reference taxon alone compare with the combined
        annotations loaded from termsPath.

        refTaxon  -- NCBI id of the reference taxon (converted to str), it
                     must be listed in self.taxaList
        termsPath -- path handed to self.load_dicts to get gene2go and go2gene

        Raises an Exception when refTaxon is not in self.taxaList.
        Nothing is returned, the summary is only printed.
        """

        refTaxon = str(refTaxon)
        if refTaxon not in self.taxaList:
            raise Exception("refTaxon not present in taxaList")

        gene2id = {}
        ## stays empty if the database has no Taxon row for refTaxon
        refGenes = {}
        conn = self.engine.connect()
        try:
            gene2go, go2gene = self.load_dicts(termsPath=termsPath)

            s = select([Taxon.id, Taxon.ncbi_id,
                        Taxon.name]).where(Taxon.ncbi_id.in_(self.taxaList))
            taxaQueries = conn.execute(s).fetchall()

            for tquery in taxaQueries:
                s = select([Gene.id, Gene.ncbi_id],
                           Gene.taxa_id == tquery['id'])
                geneRows = conn.execute(s).fetchall()
                taxaDict = dict((str(r['ncbi_id']), str(r['id']))
                                for r in geneRows)
                if str(tquery['ncbi_id']) == refTaxon:
                    refGenes = taxaDict.copy()

                print("there are  %s genes from %s (%s)" %
                      (len(taxaDict), tquery['name'], tquery['ncbi_id']))
                gene2id.update(taxaDict)
        finally:
            ## always release the connection, even when a query fails
            conn.close()

        ## check for unmatched genes (annotated ids without a Gene row)
        unmatched = sum(1 for gene in gene2go if gene not in gene2id)

        print("Summary")
        print("IEA annotations: %s" % self.useIea)
        print("total genes in combined taxa: %s" % (len(gene2id)))
        if unmatched > 0:
            print("WARNING: there were unmatched genes unmatched: %s" %
                  unmatched)
        print("total genes with at least one annotation: %s" %
              (len(gene2go)))
        print("total unique annotations: %s" % (len(go2gene)))
        print("---------------------")

        ## annotations of the reference taxon on its own
        _gene2go, _prot2go = fetch_taxa_annotations([refTaxon],
                                                    self.engine,
                                                    aspect=self.aspect,
                                                    useIea=self.useIea)

        total = sum(len(terms) for terms in _gene2go.values())
        print('RefTaxa genes: %s' % (len(refGenes)))
        print('Only RefTaxa: %s annotated genes, %s total annotations' %
              (len(_gene2go), total))

        total = sum(len(terms) for terms in gene2go.values())
        print(
            'With additional taxa: %s annotated genes, %s total annotations' %
            (len(gene2go), total))

        ## report nan instead of dividing by zero when no reference genes were found
        if refGenes:
            fraction = float(len(gene2go)) / float(len(refGenes))
        else:
            fraction = float('nan')
        print('Percent annotation: %s' % fraction)
Exemplo n.º 7
0
def enrichment_hypergeo(
    termList, entityList, species, useIea=True, asGenes=True, aspect="biological_process", verbose=True
):

    """
    termList -- are the terms to be tested
    species  -- an ncbi taxa id
    entityList -- gene or uniprot ids
    useIea, aspect, verbose -- forwarded to fetch_taxa_annotations
    asGenes -- when True the gene annotations are used, otherwise the uniprot ones

    What is the probability of finding a given number of terms if we randomly select N out of M objects?

    M -- genes with at least one annotation
    N -- number of draws or size of gene list
    k -- the number of genes annotated by a given term (total type I objects)
    x -- number of times we observe a term in the gene list (draws)

    in R the cdf can be obtained with
    phyper(x,k,M-k,N)
    and with scipy
    hypergeom.cdf(x, M, k, N)

    Returns a dict where term id is the key and hypergeo pvalue is the value.
    The value is nan when any of x, M, N or k is zero, which includes terms
    that annotate no entity of the species.
    """

    ## connect to db and get annotations for the species
    _session, engine = db_connect()
    geneAnnots, uniprotAnnots = fetch_taxa_annotations([species], engine, useIea=useIea, verbose=verbose, aspect=aspect)

    if asGenes == True:
        entity2go = geneAnnots
    else:
        entity2go = uniprotAnnots

    ## invert the mapping: term -> set of entities annotated with it
    go2entity = {}
    for entity, terms in entity2go.items():
        for term in terms:
            go2entity.setdefault(term, set()).add(entity)

    print ("total go terms - %s" % (len(go2entity)))
    print ("total entities - %s" % (len(entity2go)))

    ## set variables
    M = len(entity2go)
    N = len(entityList)
    results = {}

    for testTerm in termList:
        ## a term without any annotation gives k = 0 (and a nan p-value below)
        ## instead of raising a KeyError
        k = len(go2entity.get(testTerm, ()))
        x = sum(1 for entity in entityList
                if entity in entity2go and testTerm in entity2go[entity])

        ## get a p-value
        if 0 in [x, M, N, k]:
            pvalue = np.nan
        else:
            cdf = hypergeom.cdf(x, M, k, N)
            ## NOTE(review): with the threshold at 0 the first branch is taken
            ## for any positive cdf, so the result exceeds 1 when cdf < 0.5 --
            ## looks like 0.5 may have been intended; left unchanged, confirm
            if cdf > 0:
                pvalue = 2 * (1 - cdf)
            else:
                pvalue = 2 * cdf
        results[testTerm] = pvalue

    return results
Exemplo n.º 8
0
import sys,time
from sqlalchemy.sql import select
from htsint.database import db_connect,fetch_annotations,fetch_taxa_annotations
from htsint.database import Taxon,taxa_mapper,Gene,gene_mapper 
## Scratch script: times fetch_taxa_annotations for a single taxon and then
## exits; everything after sys.exit() is currently not executed.
session,engine = db_connect()
conn = engine.connect()


## earlier timing run for fetch_annotations, kept for reference
#timeStart = time.time()
#annotations = fetch_annotations(['31251'],engine,idType='ncbi',useIea=False,aspect='biological_process')
#print("end: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))
#print annotations

## presumably taxon ids with a small / large number of annotations -- TODO confirm
##7091(small), 7227(large)
timeStart = time.time()
annotations,goTerms = fetch_taxa_annotations(['7227'],engine,idType='ncbi',useIea=False,aspect='biological_process')
## elapsed wall-clock time formatted as HH:MM:SS
print("end: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))
#print annotations



## stop here: the table scan below is unreachable while this call is in place
sys.exit()



###########
## table to scan; swap in Taxon to scan the taxa table instead
widget = Gene#Taxon
print("scanning %s"%widget.__tablename__)

timeStart = time.time()
myDict = {}
Exemplo n.º 9
0
def enrichment_hypergeo(termList,
                        entityList,
                        species,
                        useIea=True,
                        asGenes=True,
                        aspect='biological_process',
                        verbose=True):
    '''
    termList -- are the terms to be tested
    species  -- an ncbi taxa id
    entityList -- gene or uniprot ids
    useIea, aspect, verbose -- forwarded to fetch_taxa_annotations
    asGenes -- when True the gene annotations are used, otherwise the uniprot ones

    What is the probability of finding a given number of terms if we randomly select N out of M objects?

    M -- genes with at least one annotation
    N -- number of draws or size of gene list
    k -- the number of genes annotated by a given term (total type I objects)
    x -- number of times we observe a term in the gene list (draws)

    in R the cdf can be obtained with
    phyper(x,k,M-k,N)
    and with scipy
    hypergeom.cdf(x, M, k, N)

    Returns a dict where term id is the key and hypergeo pvalue is the value.
    The value is nan when any of x, M, N or k is zero, which includes terms
    that annotate no entity of the species.
    '''

    ## connect to db and get annotations for the species
    _session, engine = db_connect()
    geneAnnots, uniprotAnnots = fetch_taxa_annotations([species],
                                                       engine,
                                                       useIea=useIea,
                                                       verbose=verbose,
                                                       aspect=aspect)

    if asGenes == True:
        entity2go = geneAnnots
    else:
        entity2go = uniprotAnnots

    ## invert the mapping: term -> set of entities annotated with it
    go2entity = {}
    for entity, terms in entity2go.items():
        for term in terms:
            go2entity.setdefault(term, set()).add(entity)

    print('total go terms - %s' % (len(go2entity)))
    print('total entities - %s' % (len(entity2go)))

    ## set variables
    M = len(entity2go)
    N = len(entityList)
    results = {}

    for testTerm in termList:
        ## a term without any annotation gives k = 0 (and a nan p-value below)
        ## instead of raising a KeyError
        k = len(go2entity.get(testTerm, ()))
        x = sum(1 for entity in entityList
                if entity in entity2go and testTerm in entity2go[entity])

        ## get a p-value
        if 0 in [x, M, N, k]:
            pvalue = np.nan
        else:
            cdf = hypergeom.cdf(x, M, k, N)
            ## NOTE(review): with the threshold at 0 the first branch is taken
            ## for any positive cdf, so the result exceeds 1 when cdf < 0.5 --
            ## looks like 0.5 may have been intended; left unchanged, confirm
            if cdf > 0:
                pvalue = 2 * (1 - cdf)
            else:
                pvalue = 2 * cdf
        results[testTerm] = pvalue

    return results
Exemplo n.º 10
0
## Scratch script: times fetch_taxa_annotations for a single taxon and then
## exits; everything after sys.exit() is currently not executed.

## sys and time are used below (time.time, time.strftime, sys.exit) and
## therefore have to be imported here
import sys
import time

from sqlalchemy.sql import select
from htsint.database import db_connect, fetch_annotations, fetch_taxa_annotations
from htsint.database import Taxon, taxa_mapper, Gene, gene_mapper

session, engine = db_connect()
conn = engine.connect()

## earlier timing run for fetch_annotations, kept for reference
#timeStart = time.time()
#annotations = fetch_annotations(['31251'],engine,idType='ncbi',useIea=False,aspect='biological_process')
#print("end: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))
#print annotations

## presumably taxon ids with a small / large number of annotations -- TODO confirm
##7091(small), 7227(large)
timeStart = time.time()
annotations, goTerms = fetch_taxa_annotations(['7227'],
                                              engine,
                                              idType='ncbi',
                                              useIea=False,
                                              aspect='biological_process')
## elapsed wall-clock time formatted as HH:MM:SS
print("end: %s" %
      time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart)))
#print annotations

## stop here: the table scan below is unreachable while this call is in place
sys.exit()

###########
## table to scan; swap in Taxon to scan the taxa table instead
widget = Gene  #Taxon
print("scanning %s" % widget.__tablename__)

timeStart = time.time()
myDict = {}
s = select([widget.id, widget.ncbi_id])