예제 #1
0
파일: GeneSet.py 프로젝트: xflicsu/htsint
    def __init__(self,verbose=True):
        """
        Connect to the database through db_connect and set the default
        attribute values used by the instance.

        verbose -- forwarded to db_connect and kept as self.verbose
        """

        ## database handles
        dbHandles = db_connect(verbose=verbose)
        self.session, self.engine = dbHandles
        self.conn = self.engine.connect()

        ## default colors (hex codes plus a few named colors)
        palette = [
            "#000000", "#FFCC33", "#3333DD", "#002200", "y", "b",
            "orange", "#CC55FF", "#990033", "#FF6600", "#CCCCCC", "#660033",
            "#FFCC00", "#FFFFAA", "#6622AA", "#33FF77", "#998800", "#0000FF",
            "#995599", "#00AA00", "#777777", "#FF0033", "#990066", "#FA58AC",
            "#8A0808", "#D8D8D8", "#CC2277", "#336666", "#996633", "#FFCCCC",
            "#CC0011", "#FFBB33", "#DDDDDD", "#991188", "#FF9966", "#009999",
            "#FF0099", "#996633", "#990000", "#660000", "#9900BB", "#330033",
            "#FF5544", "#9966CC", "#330066", "#99FF99", "#FF99FF", "#333333",
            "#CC3333", "#CC9900", "#99DD22", "#3322BB", "#663399", "#002255",
            "#003333", "#66CCFF", "#CCFFFF", "#AA11BB", "#000011", "#FFCCFF",
            "#00EE33", "#337722", "#CCBBFF", "#FF3300", "#009999", "#110000",
            "#AAAAFF", "#990000", "#880022", "#BBBBBB", "#00EE88", "#66AA22",
            "#99FFEE", "#660022", "#FFFF33", "#00CCFF", "#990066", "#006600",
            "#00CCFF", "#AAAAAA", "#33FF00", "#0066FF", "#FF9900", "#FFCC00",
        ]

        ## instance-wide settings
        self.verbose = verbose
        self.dpi = 400
        self.labelOffset = 0.05
        self.fontSize = self.legendFontSize = 8
        self.increment = 0.03
        self.fontName = 'serif'
        self.termAlpha = 0.95
        self.geneAlpha = 0.4
        self.nodeSizeGene = self.nodeSizeTerm = 300
        self.colors = palette
        self.cmap = plt.cm.Blues
        self.lineEnd = 53
        self.linesMax = 32
예제 #2
0
    def setUp(self):
        """
        Open a database session, engine and connection, and store the
        identifier '5476' as self.testID.
        """

        dbHandles = db_connect(upass=UPASS)
        self.session, self.engine = dbHandles
        self.conn = self.engine.connect()
        self.testID = '5476'
예제 #3
0
    def __init__(self):
        """
        Connect to the database and start with no hits loaded.
        """

        dbHandles = db_connect()
        self.session, self.engine = dbHandles
        self.conn = self.engine.connect()

        ## nothing stored yet
        self.hits = None
예제 #4
0
    def setUp(self):
        """
        Connect to the database with the UPASS credential and keep the
        session, engine and an open connection on the instance.
        """

        session, engine = db_connect(upass=UPASS)
        self.session = session
        self.engine = engine
        self.conn = self.engine.connect()

        ## identifier stored as self.testID
        self.testID = '5476'
예제 #5
0
    def __init__(self):
        """
        Open the database session, engine and connection; self.hits
        starts out as None.
        """

        session, engine = db_connect()
        self.session = session
        self.engine = engine
        self.conn = self.engine.connect()
        self.hits = None
예제 #6
0
    def __init__(self,taxaInput,delimiter="\t",upass=''):
        """
        Build the list of taxa and connect to the database.

        taxaInput -- a python list of ncbi taxa ids (strings of digits) or
                     a path that is handed to self.read_file
        delimiter -- accepted for the file input; it is not used by the
                     code in this constructor
        upass     -- forwarded to db_connect

        When a file is given the first column is the ncbi taxa id; all
        additional columns are optional and user specified.

        Raises an Exception when a list entry contains a non-digit character.
        """

        self.taxaList = []
        self.session,self.engine = db_connect(upass=upass)
        if isinstance(taxaInput,list):
            for taxon in taxaInput:
                ## anything other than digits cannot be an ncbi taxa id
                if re.search(r"\D",taxon):
                    raise Exception("Bad taxa in taxaList (%s)"%(taxon))
                self.taxaList.append(taxon)
        else:
            ## duplicates coming from the file are collapsed
            self.taxaList = list(set(self.read_file(taxaInput)))

        print("There are %s unique taxa in the list"%(len(self.taxaList)))
예제 #7
0
    def __init__(self,taxaList,verbose=False,upass='',idType='ncbi',useIea=True,\
                 aspect='biological_process'):
        """
        Validate the id type, open a database session and store the arguments.

        taxaList -- a list of NCBI taxa ids
        idType   -- 'uniprot' or 'ncbi' (case insensitive); anything else raises
        verbose, upass -- forwarded to db_connect
        useIea, aspect -- stored unchanged on the instance
        """

        ## normalize and validate the id type
        idType = idType.lower()
        if idType != 'uniprot' and idType != 'ncbi':
            raise Exception("Invalid idType argument in fetch annotations use 'uniprot' or 'ncbi'")

        ## database session
        dbHandles = db_connect(verbose=verbose,upass=upass)
        self.session,self.engine = dbHandles

        ## keep the arguments on the instance
        self.taxaList,self.idType = taxaList,idType
        self.useIea,self.aspect = useIea,aspect
예제 #8
0
    def __init__(self, taxaInput, delimiter="\t", upass=''):
        """
        Build the list of taxa and connect to the database.

        taxaInput -- a python list of ncbi taxa ids (strings of digits) or
                     a path that is handed to self.read_file
        delimiter -- accepted for the file input; it is not used by the
                     code in this constructor
        upass     -- forwarded to db_connect

        When a file is given the first column is the ncbi taxa id; all
        additional columns are optional and user specified.

        Raises an Exception when a list entry contains a non-digit character.
        """

        self.taxaList = []
        self.session, self.engine = db_connect(upass=upass)
        if isinstance(taxaInput, list):
            for taxon in taxaInput:
                ## anything other than digits cannot be an ncbi taxa id
                if re.search(r"\D", taxon):
                    raise Exception("Bad taxa in taxaList (%s)" % (taxon))
                self.taxaList.append(taxon)
        else:
            ## duplicates coming from the file are collapsed
            self.taxaList = list(set(self.read_file(taxaInput)))

        print("There are %s unique taxa in the list" % (len(self.taxaList)))
예제 #9
0
파일: GeneSet.py 프로젝트: changanla/htsint
    def __init__(self, verbose=True):
        """
        Open the database handles and assign the default attribute values.

        verbose -- passed on to db_connect and stored as self.verbose
        """

        ## setup db
        session, engine = db_connect(verbose=verbose)
        self.session = session
        self.engine = engine
        self.conn = self.engine.connect()

        ## global variables
        self.verbose = verbose
        self.dpi = 400
        self.labelOffset = 0.05
        self.fontSize = 8
        self.legendFontSize = 8
        self.increment = 0.03
        self.fontName = 'serif'
        self.termAlpha, self.geneAlpha = 0.95, 0.4
        self.nodeSizeGene, self.nodeSizeTerm = 300, 300
        self.colors = [
            '#000000', '#FFCC33', '#3333DD', '#002200', 'y', 'b', 'orange', '#CC55FF', '#990033', '#FF6600', '#CCCCCC', '#660033',
            '#FFCC00', '#FFFFAA', '#6622AA', '#33FF77', '#998800', '#0000FF', '#995599', '#00AA00', '#777777', '#FF0033', '#990066', '#FA58AC',
            '#8A0808', '#D8D8D8', '#CC2277', '#336666', '#996633', '#FFCCCC', '#CC0011', '#FFBB33', '#DDDDDD', '#991188', '#FF9966', '#009999',
            '#FF0099', '#996633', '#990000', '#660000', '#9900BB', '#330033', '#FF5544', '#9966CC', '#330066', '#99FF99', '#FF99FF', '#333333',
            '#CC3333', '#CC9900', '#99DD22', '#3322BB', '#663399', '#002255', '#003333', '#66CCFF', '#CCFFFF', '#AA11BB', '#000011', '#FFCCFF',
            '#00EE33', '#337722', '#CCBBFF', '#FF3300', '#009999', '#110000', '#AAAAFF', '#990000', '#880022', '#BBBBBB', '#00EE88', '#66AA22',
            '#99FFEE', '#660022', '#FFFF33', '#00CCFF', '#990066', '#006600', '#00CCFF', '#AAAAAA', '#33FF00', '#0066FF', '#FF9900', '#FFCC00',
        ]
        self.cmap = plt.cm.Blues
        self.lineEnd, self.linesMax = 53, 32
예제 #10
0
    def __init__(self,taxaList,verbose=False,upass='',idType='ncbi',useIea=True,\
                 aspect='biological_process'):
        """
        Check the id type, connect to the database and record the arguments.

        taxaList -- a list of NCBI taxa ids
        idType   -- lower-cased before checking; must be 'uniprot' or 'ncbi',
                    otherwise an Exception is raised
        """

        ## error checking
        normalizedType = idType.lower()
        if normalizedType not in ('uniprot', 'ncbi'):
            message = "Invalid idType argument in fetch annotations use 'uniprot' or 'ncbi'"
            raise Exception(message)

        ## start a database session
        session, engine = db_connect(verbose=verbose, upass=upass)
        self.session = session
        self.engine = engine

        ## global variables
        self.taxaList = taxaList
        self.idType = normalizedType
        self.useIea = useIea
        self.aspect = aspect
예제 #11
0
def enrichment_hypergeo(
    termList, entityList, species, useIea=True, asGenes=True, aspect="biological_process", verbose=True
):

    """
    Compute a hypergeometric p-value for each term in termList.

    termList -- are the terms to be tested
    species  -- an ncbi taxa id
    entityList -- gene or uniprot ids
    useIea, aspect, verbose -- forwarded to fetch_taxa_annotations
    asGenes -- when True the gene annotations are used, otherwise the
               uniprot annotations

    What is the probability of finding a given number of terms if we randomly select N out of M objects?

    M -- genes with at least one annotation
    N -- number of draws or size of gene list
    k -- the number of genes annotated by a given term (total type I objects)
    x -- number of times we observe a term in the gene list (draws)

    in R the cdf can be obtained with
    phyper(x,k,M-k,N)
    hypergeom.pmf(x, M, k, N)

    Returns a dict where term id is the key and hypergeo pvalue is the value.
    The value is nan when any of x, M, N or k is zero, which includes terms
    that annotate no entity of the species.
    """

    ## connect to db and get annotations for the species
    session, engine = db_connect()
    geneAnnots, uniprotAnnots = fetch_taxa_annotations([species], engine, useIea=useIea, verbose=verbose, aspect=aspect)

    if asGenes == True:
        entity2go = geneAnnots
    else:
        entity2go = uniprotAnnots

    ## invert the mapping: term -> set of entities annotated by it
    go2entity = {}
    for entity, terms in entity2go.items():
        for term in terms:
            go2entity.setdefault(term, set()).add(entity)

    print ("total go terms - %s" % (len(go2entity.keys())))
    print ("total entities - %s" % (len(entity2go.keys())))

    ## set variables
    M = len(entity2go)
    N = len(entityList)
    results = {}

    for testTerm in termList:
        ## a term without any annotation gives k = 0 (and so nan) instead of a KeyError
        k = len(go2entity.get(testTerm, ()))
        x = sum(1 for entity in entityList if entity in entity2go and testTerm in entity2go[entity])

        ## get a p-value
        if 0 in [x, M, N, k]:
            pvalue = np.nan
        else:
            cdf = hypergeom.cdf(x, M, k, N)
            # NOTE(review): 2 * (1 - cdf) exceeds 1 whenever cdf < 0.5 -- confirm
            # that this doubling is the intended statistic.
            if cdf > 0:
                pvalue = 2 * (1 - cdf)
            else:
                pvalue = 2 * cdf
        results[testTerm] = pvalue

    return results
예제 #12
0
#!/usr/bin/env python
"""
probe the taxa in the list for annotation coverage
and summary information

"""

import sys, getopt, os

from htsint import TaxaSummary
from htsint.database import db_connect

# Fragment that counts protein-coding and annotated genes/proteins.
# NOTE(review): ``self`` is used at module level and taxaQuery, Uniprot,
# uniprotQuery, annotations and remove_empty are not defined or imported in
# the visible code -- this looks like a method body pasted into a script;
# confirm where it is meant to run.
self.session, self.engine = db_connect()

## get how many genes code for proteins
codingQuery = self.session.query(Uniprot).filter_by(taxa_id=taxaQuery.id).all()
# NOTE(review): the comprehension iterates uniprotQuery, not the codingQuery
# fetched on the line above -- confirm which one is intended.
codingGenes = list(set([u.gene_id for u in uniprotQuery]))
remove_empty(codingGenes)

## get number of genes/proteins with at least one annotation
annotatedGenes = list(set([a.gene_id for a in annotations]))
annotatedProts = list(set([a.uniprot_id for a in annotations]))
remove_empty(annotatedGenes)
remove_empty(annotatedProts)

# One query is issued per annotated protein id; the commented-out line below
# is a single-query alternative using in_().
apQuery = [
    self.session.query(Uniprot).filter_by(id=uid).first()
    for uid in annotatedProts
]
#apQuery = self.session.query(Uniprot).filter(Uniprot.id.in_(annotatedProts)).all()
예제 #13
0
def enrichment_hypergeo(termList,
                        entityList,
                        species,
                        useIea=True,
                        asGenes=True,
                        aspect='biological_process',
                        verbose=True):
    '''
    Compute a hypergeometric p-value for each term in termList.

    termList -- are the terms to be tested
    species  -- an ncbi taxa id
    entityList -- gene or uniprot ids
    useIea, aspect, verbose -- forwarded to fetch_taxa_annotations
    asGenes -- when True the gene annotations are used, otherwise the
               uniprot annotations

    What is the probability of finding a given number of terms if we randomly select N out of M objects?

    M -- genes with at least one annotation
    N -- number of draws or size of gene list
    k -- the number of genes annotated by a given term (total type I objects)
    x -- number of times we observe a term in the gene list (draws)

    in R the cdf can be obtained with
    phyper(x,k,M-k,N)
    hypergeom.pmf(x, M, k, N)

    Returns a dict where term id is the key and hypergeo pvalue is the value.
    The value is nan when any of x, M, N or k is zero, which includes terms
    that annotate no entity of the species.
    '''

    ## connect to db and get annotations for the species
    session, engine = db_connect()
    geneAnnots, uniprotAnnots = fetch_taxa_annotations([species],
                                                       engine,
                                                       useIea=useIea,
                                                       verbose=verbose,
                                                       aspect=aspect)

    if asGenes == True:
        entity2go = geneAnnots
    else:
        entity2go = uniprotAnnots

    ## invert the mapping: term -> set of entities annotated by it
    go2entity = {}
    for entity, terms in entity2go.items():
        for term in terms:
            go2entity.setdefault(term, set()).add(entity)

    print('total go terms - %s' % (len(go2entity.keys())))
    print('total entities - %s' % (len(entity2go.keys())))

    ## set variables
    M = len(entity2go)
    N = len(entityList)
    results = {}

    for testTerm in termList:
        ## a term without any annotation gives k = 0 (and so nan) instead of a KeyError
        k = len(go2entity.get(testTerm, ()))
        x = sum(1 for entity in entityList
                if entity in entity2go and testTerm in entity2go[entity])

        ## get a p-value
        if 0 in [x, M, N, k]:
            pvalue = np.nan
        else:
            cdf = hypergeom.cdf(x, M, k, N)
            # NOTE(review): 2 * (1 - cdf) exceeds 1 whenever cdf < 0.5 -- confirm
            # that this doubling is the intended statistic.
            if cdf > 0:
                pvalue = 2 * (1 - cdf)
            else:
                pvalue = 2 * cdf
        results[testTerm] = pvalue

    return results
예제 #14
0
def get_blast_map(resultsFilePath,
                  evalue=0.00001,
                  taxaList=None,
                  asGenes=False,
                  append=False):
    """
    load assembly blast results into dictionary

    resultsFilePath -- csv file with one header row; a row has either 4
                       columns (query, hit, hit ncbi id, e-value) or 5
                       columns (query, query ncbi id, hit, hit ncbi id, e-value)
    evalue   -- rows whose e-value is larger than this are skipped
    taxaList -- if provided then only genes from the given taxa are included
                in the map (the genes are looked up in the database)
    asGenes  -- if True the results are provided with keys to genes not
                isoforms (the _i<number> suffix is removed from the query id)
    append   -- if False each query maps to its best (hitNcbiId, evalue)
                tuple; if True each query maps to the list of every
                (hitNcbiId, evalue) that passed the filters, in file order

    Rows that contain a '-' field and empty rows are skipped.

    Returns the results dictionary.
    Raises an Exception when resultsFilePath does not exist.
    """

    if not os.path.exists(resultsFilePath):
        raise Exception("cannot find results file path %s" % resultsFilePath)

    ## collect the ncbi gene ids that belong to the requested taxa
    taxaGenes = None
    if taxaList:
        session, engine = db_connect()
        conn = engine.connect()
        s = select([Taxon.id, Taxon.ncbi_id,
                    Taxon.name]).where(Taxon.ncbi_id.in_(taxaList))
        taxaGenes = set()
        for tquery in conn.execute(s).fetchall():
            s = select([Gene.taxa_id, Gene.ncbi_id],
                       Gene.taxa_id == tquery['id'])
            taxaGenes.update(
                str(r['ncbi_id']) for r in conn.execute(s).fetchall())

    results = {}
    uniqueQueries = set()
    totalQueries = 0
    unfilteredQueries = 0

    ## loop through file and save best
    with open(resultsFilePath, 'r') as fid:
        reader = csv.reader(fid)
        header = next(reader, None)
        print(header)

        for linja in reader:
            if not linja:
                continue

            if len(linja) == 4:
                queryId, hitNcbiId, rawEvalue = linja[0], linja[2], linja[3]
            else:
                queryId, hitNcbiId, rawEvalue = linja[0], linja[3], linja[4]

            if asGenes == True:
                queryId = re.sub(r"_i\d+", "", queryId)

            # filtering
            totalQueries += 1
            if '-' in linja:
                continue

            ## convert after the '-' check so placeholder rows cannot break float()
            _evalue = float(rawEvalue)
            if _evalue > evalue:
                continue

            if taxaGenes is not None and str(hitNcbiId) not in taxaGenes:
                continue

            unfilteredQueries += 1
            uniqueQueries.add(queryId)

            hit = (hitNcbiId, _evalue)
            if append:
                ## keep every hit that survived the filters
                results.setdefault(queryId, []).append(hit)
            elif queryId not in results or _evalue < results[queryId][1]:
                ## use the best evalue
                results[queryId] = hit

    print("total queries: %s" % totalQueries)
    print("unfiltered queries: %s" % unfilteredQueries)
    print("unique: %s" % len(uniqueQueries))

    return results
예제 #15
0
def create_blast_map(refTaxon,
                     taxaList,
                     resultsFilePath,
                     evalue=0.00001,
                     verbose=False):
    """
    read a summarized reference blast results file and create a map
    results are gene centric

    refTaxon -- ncbi taxa id of the query species; must be in taxaList
    taxaList -- ncbi taxa ids whose genes are accepted as hits
    resultsFilePath -- csv file with one header row (format below)
    evalue   -- rows whose e-value is larger than this are skipped
    verbose  -- not used by this function

    example results file looks like this
    query(refseq),query(geneId),hit(uniprotEntry),hit(geneEntry),e-value
    NP_001016845.1,549599,AQP3_HUMAN,360,1.24637e-170
    NP_001016845.1,549599,AQP9_HUMAN,366,7.57313e-92
    NP_001016845.1,549599,AQP10_HUMAN,89872,2.87154e-85
    NP_001016845.1,549599,AQP7_HUMAN,364,8.01308e-84
    NP_001016845.1,549599,AQP5_HUMAN,362,4.267e-15

    Returns (mapper1, mapper2): mapper1 maps a hit gene to the list of
    query genes whose best hit it is, mapper2 maps a query gene to the list
    of its best hits (one per hit taxon).

    Raises an Exception when the file is missing, when refTaxon is not in
    taxaList, or when a query gene does not belong to refTaxon.
    """

    ## error check
    if not os.path.exists(resultsFilePath):
        raise Exception("cannot find results file path %s" % resultsFilePath)

    if refTaxon not in taxaList:
        raise Exception("refTaxon must be in taxaList")

    ## prepare database connections
    session, engine = db_connect()
    conn = engine.connect()

    ## map database taxa ids to ncbi taxa ids
    s = select([Taxon.id, Taxon.ncbi_id,
                Taxon.name]).where(Taxon.ncbi_id.in_(taxaList))
    taxaQueries = conn.execute(s).fetchall()
    taxa2name = dict((str(tquery['id']), str(tquery['ncbi_id']))
                     for tquery in taxaQueries)

    ## create a gene2taxa dictionary
    gene2taxa = {}
    for tquery in taxaQueries:
        s = select([Gene.taxa_id, Gene.ncbi_id], Gene.taxa_id == tquery['id'])
        for r in conn.execute(s).fetchall():
            gene2taxa[str(r['ncbi_id'])] = str(r['taxa_id'])

    ## creates a dictionary results[taxaId][queryGene] = (bestHitGene, evalue)
    results = {}
    totalQueries = set()
    filteredQueries = set()
    filteredHits = set()

    ## loop through file and save best
    with open(resultsFilePath, 'r') as fid:
        reader = csv.reader(fid)
        next(reader, None)  # skip the header row

        for linja in reader:
            if not linja:
                continue

            queryGene, hitGene = linja[1], linja[3]
            _evalue = float(linja[4])
            totalQueries.add(queryGene)

            ## filter by species and evalue
            if hitGene not in gene2taxa or _evalue > evalue:
                continue
            if queryGene == '-' or hitGene == '-':
                continue

            ## filter self matches
            taxId = taxa2name[gene2taxa[hitGene]]
            if taxId == refTaxon:
                continue

            ## an unknown query gene yields None here and so raises the
            ## explicit error below rather than a bare KeyError
            queryTaxon = taxa2name.get(gene2taxa.get(queryGene))
            if queryTaxon != refTaxon:
                raise Exception("Invalid query or invalid refTaxon %s != %s" %
                                (queryTaxon, refTaxon))

            filteredQueries.add(queryGene)
            filteredHits.add(hitGene)

            ## use the best evalue
            taxonHits = results.setdefault(taxId, {})
            if queryGene not in taxonHits or _evalue < taxonHits[queryGene][1]:
                taxonHits[queryGene] = (hitGene, _evalue)

    ## returns a simplified form of the results as two mappers
    mapper1, mapper2 = {}, {}
    for taxonHits in results.values():
        for queryGene, hit in taxonHits.items():
            mapper1.setdefault(hit[0], []).append(queryGene)
            mapper2.setdefault(queryGene, []).append(hit[0])

    print('BLAST: total queries: %s' % (len(totalQueries)))
    print('BLAST: filtered queries (evalue=%s)(taxa=%s): %s' %
          (evalue, str(taxaList), len(filteredQueries)))
    print('BLAST: filtered hits: %s' % (len(filteredHits)))
    return mapper1, mapper2
예제 #16
0
import time,sys
from htsint.database import db_connect,Gene,GoAnnotation,GoTerm
from htsint.database import fetch_annotations,gene_mapper


## variables
## Script: time how long it takes to fetch Gene rows and their GoAnnotation
## rows for the ncbi gene ids in geneList.
session,engine = db_connect()
geneList = ['30970']#,'30971','30972','30973','30975']
expEvidCodes = ["EXP","IDA","IPI","IMP","IGI","IEP"]
compEvidCodes = ["ISS","ISO","ISA","ISM","IGC","RCA"]
statEvidCodes = ["TAS","NAS","IC"]
nonCuratedEvidCodes = ["IEA"]
acceptedCodes = expEvidCodes + statEvidCodes
annotations = {}
aspect = 'biological_process'

timeStart = time.time()
geneQueries = session.query(Gene).filter(Gene.ncbi_id.in_(geneList)).all()
print("...extraction time 1: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))

#timeStart = time.time()
#geneIdMap = gene_mapper(session,ncbiIdList=geneList)
#print("...extraction time 1: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))

## get all results (one GoAnnotation query per gene)
timeStart = time.time()
for geneQuery in geneQueries:
    annotations[geneQuery.ncbi_id] = set([])
    ## print() call so the script parses on Python 3 like the other prints here
    print(geneQuery.ncbi_id)
    annotations[geneQuery.ncbi_id].update(session.query(GoAnnotation).filter_by(gene_id=geneQuery.id).all())
print("...extraction q1: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))
예제 #17
0
def create_blast_map(refTaxon,taxaList,resultsFilePath,evalue=0.00001,verbose=False):
    """
    read a summarized reference blast results file and create a map
    results are gene centric

    refTaxon -- ncbi taxa id of the query species; must be in taxaList
    taxaList -- ncbi taxa ids whose genes are accepted as hits
    resultsFilePath -- csv file with one header row (format below)
    evalue   -- rows whose e-value is larger than this are skipped
    verbose  -- not used by this function

    example results file looks like this
    query(refseq),query(geneId),hit(uniprotEntry),hit(geneEntry),e-value
    NP_001016845.1,549599,AQP3_HUMAN,360,1.24637e-170
    NP_001016845.1,549599,AQP9_HUMAN,366,7.57313e-92
    NP_001016845.1,549599,AQP10_HUMAN,89872,2.87154e-85
    NP_001016845.1,549599,AQP7_HUMAN,364,8.01308e-84
    NP_001016845.1,549599,AQP5_HUMAN,362,4.267e-15

    Returns (mapper1,mapper2): mapper1 maps a hit gene to the list of
    query genes whose best hit it is, mapper2 maps a query gene to the list
    of its best hits (one per hit taxon).

    Raises an Exception when the file is missing, when refTaxon is not in
    taxaList, or when a query gene does not belong to refTaxon.
    """

    ## error check
    if not os.path.exists(resultsFilePath):
        raise Exception("cannot find results file path %s"%resultsFilePath)

    if refTaxon not in taxaList:
        raise Exception("refTaxon must be in taxaList")

    ## prepare database connections
    session,engine = db_connect()
    conn = engine.connect()

    ## map database taxa ids to ncbi taxa ids
    s = select([Taxon.id,Taxon.ncbi_id,Taxon.name]).where(Taxon.ncbi_id.in_(taxaList))
    taxaQueries = conn.execute(s).fetchall()
    taxa2name = dict((str(tquery['id']),str(tquery['ncbi_id'])) for tquery in taxaQueries)

    ## create a gene2taxa dictionary
    gene2taxa = {}
    for tquery in taxaQueries:
        s = select([Gene.taxa_id,Gene.ncbi_id],Gene.taxa_id==tquery['id'])
        for r in conn.execute(s).fetchall():
            gene2taxa[str(r['ncbi_id'])] = str(r['taxa_id'])

    ## creates a dictionary results[taxaId][queryGene] = (bestHitGene,evalue)
    results = {}
    totalQueries = set()
    filteredQueries = set()
    filteredHits = set()

    ## loop through file and save best
    with open(resultsFilePath,'r') as fid:
        reader = csv.reader(fid)
        next(reader,None)  # skip the header row

        for linja in reader:
            if not linja:
                continue

            queryGene,hitGene = linja[1],linja[3]
            _evalue = float(linja[4])
            totalQueries.add(queryGene)

            ## filter by species and evalue
            if hitGene not in gene2taxa or _evalue > evalue:
                continue
            if queryGene == '-' or hitGene == '-':
                continue

            ## filter self matches
            taxId = taxa2name[gene2taxa[hitGene]]
            if taxId == refTaxon:
                continue

            ## an unknown query gene yields None here and so raises the
            ## explicit error below rather than a bare KeyError
            queryTaxon = taxa2name.get(gene2taxa.get(queryGene))
            if queryTaxon != refTaxon:
                raise Exception("Invalid query or invalid refTaxon %s != %s"%(queryTaxon,refTaxon))

            filteredQueries.add(queryGene)
            filteredHits.add(hitGene)

            ## use the best evalue
            taxonHits = results.setdefault(taxId,{})
            if queryGene not in taxonHits or _evalue < taxonHits[queryGene][1]:
                taxonHits[queryGene] = (hitGene,_evalue)

    ## returns a simplified form of the results as two mappers
    mapper1,mapper2 = {},{}
    for taxonHits in results.values():
        for queryGene,hit in taxonHits.items():
            mapper1.setdefault(hit[0],[]).append(queryGene)
            mapper2.setdefault(queryGene,[]).append(hit[0])

    print('BLAST: total queries: %s'%(len(totalQueries)))
    print('BLAST: filtered queries (evalue=%s)(taxa=%s): %s'%(evalue,str(taxaList),len(filteredQueries)))
    print('BLAST: filtered hits: %s'%(len(filteredHits)))
    return mapper1,mapper2
예제 #18
0
#!/usr/bin/python

import time
from sqlalchemy.sql import select
from htsint.database import db_connect, Taxon, Gene, Uniprot, Refseq
from htsint.database import uniprot_mapper

## Script setup: open the database handles and define the uniprot entry
## names used by the timing code that follows.
session, engine = db_connect()
conn = engine.connect()

## every entry is followed by a comma: without it, adjacent string literals on
## consecutive lines are silently joined into one bogus name
uniprotEntries = [
    "KCNQ4_MOUSE", "CSMT1_XENTR", "CSMT1_MOUSE", "MILK2_MOUSE", "MILK2_RAT",
    "MILK1_RAT", "MILK1_MOUSE", "MICA3_MOUSE", "MCA3A_DANRE", "MCA3B_DANRE",
    "MICA1_DANRE", "MCA2B_DANRE", "MICLK_MOUSE", "MICLK_RAT", "MILK2_RAT",
    "MILK2_MOUSE", "MILK1_RAT", "MILK1_MOUSE", "MICA3_MOUSE", "MICA2_RAT",
    "MCA3A_DANRE", "MICA2_MOUSE", "EHBP1_MOUSE", "EH1L1_MOUSE", "SPTB2_MOUSE",
    "MCA2B_DANRE", "SPTN2_RAT", "MICA2_XENTR", "SPTCB_DROME", "ACTN_DROME",
    "SPTB1_MOUSE", "ACTN2_MOUSE", "ACTN3_MOUSE", "ACTN2_CHICK", "ACTN1_RAT",
    "ACTN1_CHICK", "ACTN1_MOUSE", "CYTSA_CHICK", "MCA3B_DANRE", "CYTSA_CANFA",
    "CYTSA_DANRE", "CYTSA_MOUSE", "AIN1_SCHPO", "MICA1_DANRE", "CYTSA_RAT",
    "SYNE2_MOUSE", "ACTN4_CHICK", "ACTN4_MOUSE", "ACTN4_RAT", "CYTSA_XENTR",
    "CYTSB_MOUSE", "SMTL2_MOUSE", "SMTN_MOUSE", "DYST_MOUSE", "PLEC_RAT",
    "PLEC_MOUSE", "DMD_CHICK", "DMD_CANFA", "DMD_MOUSE", "MICA1_RAT",
    "SMTL1_MOUSE", "MICA1_MOUSE", "MACF1_MOUSE", "MACF1_RAT", "DMD_CAEEL",
    "MILK2_MOUSE", "MILK2_RAT", "MILK1_RAT", "MILK1_MOUSE", "ACTN4_CHICK",
    "ACTN4_RAT", "ACTN4_MOUSE", "ACTN1_CHICK", "ACTN_DROME", "ACTN1_RAT",
    "ACTN1_MOUSE", "ACTN3_MOUSE", "SPTCB_DROME", "ACTN2_MOUSE", "ACTN2_CHICK",
]

## using select method
timeStart = time.time()
예제 #19
0
def get_blast_map(resultsFilePath,evalue=0.00001,taxaList=None,asGenes=False,append=False):
    """
    load assembly blast results into dictionary

    resultsFilePath -- csv file with one header row; a row has either 4
                       columns (query, hit, hit ncbi id, e-value) or 5
                       columns (query, query ncbi id, hit, hit ncbi id, e-value)
    evalue   -- rows whose e-value is larger than this are skipped
    taxaList -- if provided then only genes from the given taxa are included
                in the map (the genes are looked up in the database)
    asGenes  -- if True the results are provided with keys to genes not
                isoforms (the _i<number> suffix is removed from the query id)
    append   -- if False each query maps to its best (hitNcbiId,evalue)
                tuple; if True each query maps to the list of every
                (hitNcbiId,evalue) that passed the filters, in file order

    Rows that contain a '-' field and empty rows are skipped.

    Returns the results dictionary.
    Raises an Exception when resultsFilePath does not exist.
    """

    if not os.path.exists(resultsFilePath):
        raise Exception("cannot find results file path %s"%resultsFilePath)

    ## collect the ncbi gene ids that belong to the requested taxa
    taxaGenes = None
    if taxaList:
        session,engine = db_connect()
        conn = engine.connect()
        s = select([Taxon.id,Taxon.ncbi_id,Taxon.name]).where(Taxon.ncbi_id.in_(taxaList))
        taxaGenes = set()
        for tquery in conn.execute(s).fetchall():
            s = select([Gene.taxa_id,Gene.ncbi_id],Gene.taxa_id==tquery['id'])
            taxaGenes.update(str(r['ncbi_id']) for r in conn.execute(s).fetchall())

    results = {}
    uniqueQueries = set()
    totalQueries = 0
    unfilteredQueries = 0

    ## loop through file and save best; the with block closes the file
    with open(resultsFilePath,'r') as fid:
        reader = csv.reader(fid)
        header = next(reader,None)
        print(header)

        for linja in reader:
            if not linja:
                continue

            if len(linja) == 4:
                queryId,hitNcbiId,rawEvalue = linja[0],linja[2],linja[3]
            else:
                queryId,hitNcbiId,rawEvalue = linja[0],linja[3],linja[4]

            if asGenes == True:
                queryId = re.sub(r"_i\d+","",queryId)

            # filtering
            totalQueries += 1
            if '-' in linja:
                continue

            ## convert after the '-' check so placeholder rows cannot break float()
            _evalue = float(rawEvalue)
            if _evalue > evalue:
                continue

            if taxaGenes is not None and str(hitNcbiId) not in taxaGenes:
                continue

            unfilteredQueries += 1
            uniqueQueries.add(queryId)

            hit = (hitNcbiId,_evalue)
            if append:
                ## keep every hit that survived the filters
                results.setdefault(queryId,[]).append(hit)
            elif queryId not in results or _evalue < results[queryId][1]:
                ## use the best evalue
                results[queryId] = hit

    print("total queries: %s"%totalQueries)
    print("unfiltered queries: %s"%unfilteredQueries)
    print("unique: %s"%len(uniqueQueries))

    return results