def abstracts(request):
    """Return a list of abstracts (as HTML wrapped in JSON) for a keyword
    query and list of genes.

    All inputs are read from the query string (``request.GET``):

    species      -- integer species ID; defaults to 9606 (human)
    geneop       -- 'all' combines genes with AND; any other value (or a
                    missing parameter) means 'any' (OR)
    usehomologs  -- boolean; whether to include homologs (default False)
    q            -- keyword query
    usegenefile  -- boolean; if true, genes come from the uploaded gene file
                    identified by ``genefileID`` instead of ``genes``
    genes        -- gene query string
    rowgene      -- extra gene added to the gene query
    genefilter   -- extra gene added to the gene query as a filter
    onlyreviews  -- boolean; only include reviews (default False)
    orderby      -- one of 'relevance', 'oldest', 'newest' (case-insensitive),
                    or absent
    offset       -- integer offset (default 0)
    limit        -- integer limit (default None)
    keywordnum   -- optional key phrase ID used to restrict the abstracts
    metabolite   -- optional metabolite ID

    The response body is a JSON object.  On success it is
    ``{'validresult': True, 'result': <rendered HTML>}``; on failure it is
    ``{'validresult': False, 'errmsg': <message>}``.
    """

    def json_response(payload):
        # HttpResponse is used as the file-like object json.dump writes into
        response = HttpResponse()
        json.dump(payload, response)
        return response

    def error(errmsg):
        return json_response({'validresult': False, 'errmsg': errmsg})

    # get species, default=human
    try:
        species = int(request.GET['species'])
    except (KeyError, ValueError):
        species = 9606

    # get gene operator (any (or) / all (and)); anything but 'all' means 'any'
    implicitOr = request.GET.get('geneop', 'any').lower() != 'all'

    # figure out if we should include homologs
    try:
        usehomologs = parseboolean(request.GET['usehomologs'])
    except (KeyError, ValueError):
        usehomologs = False

    # get keyword arguments from query string
    keywords = request.GET.get('q')

    # get genes from query string
    try:
        if parseboolean(request.GET.get('usegenefile')):
            # look up genes from file if we're using one
            genefileID = request.GET.get('genefileID', -1)
            genes = genefile_lookup(genefileID, implicitOr, usehomologs)
        elif request.GET.get('genes'):
            gene_query = request.GET.get('genes')
            genes = parse_gene_abstractquery(gene_query, species, implicitOr,
                                             usehomologs)
        else:
            genes = NullQuery

        if request.GET.get('rowgene'):
            genes = addgene(genes, request.GET.get('rowgene'), species,
                            usehomologs)

        # apply gene filter
        if request.GET.get('genefilter'):
            genes = addgene(genes, request.GET.get('genefilter'), species,
                            usehomologs)
    except LookupError as e:
        # bad gene query
        return error('Bad gene query. Check your gene symbols: {0}.'.format(e.args[0]))
    except BadGenefileError:
        # the JSON must be written into the response; calling json.dump
        # without a file object raises TypeError instead of reporting this
        return error("Can't find this gene file! It probably expired. Please upload it again.")

    # should we only include reviews?
    try:
        onlyreviews = parseboolean(request.GET['onlyreviews'])
    except (KeyError, ValueError):
        onlyreviews = False

    # error if no query
    if not keywords and not genes:
        return error('You must supply either genes or a query')

    # get sorting parameter
    orderby = request.GET.get('orderby')
    if orderby:
        orderby = orderby.lower()
    if orderby not in (None, 'relevance', 'oldest', 'newest'):
        return error('Invalid "orderby." Valid options are None, "relevance", "oldest", or "newest".')

    # get limit and offset; a missing value (None -> TypeError) or a
    # non-numeric value (ValueError) falls back to the default
    try:
        offset = int(request.GET.get('offset'))
    except (TypeError, ValueError):
        offset = 0

    try:
        limit = int(request.GET.get('limit'))
    except (TypeError, ValueError):
        limit = None

    # get keyword ID from query string
    keywordID = request.GET.get('keywordnum')
    if keywordID:
        keyword_abstracts = [
            a.pubmed_id
            for a in Abstract.objects.filter(
                ka_abstract__keyphrase=keywordID).only('pubmed_id')
        ]
    else:
        keyword_abstracts = None

    # get optional metabolite ID
    metabolite = request.GET.get('metabolite')

    # get abstract ID's from index
    page = abstracts_page(keywords, genes, usehomologs, limit, offset,
                          orderby, onlyreviews, keyword_abstracts, metabolite)

    # error if no abstracts
    if not page:
        return error('No more abstracts!')

    # create response
    resulthtml = render_to_string('abstracts.html', {'abstracts': page})
    return json_response({'validresult': True, 'result': resulthtml})
def genesearch(request):
    """Does the actual search for the gene search.

    Given a keyword query, a list of genes, species, homology option,
    offset, limit, sorting criterion, and response type (all via the query
    string, parsed by ``searchparams``), fetches a list of genes relevant to
    the query via the index and database, and returns the appropriate
    response.

    Every exit path returns the result of ``searchresponse``: with
    ``validresult=False`` and an ``errmsg`` when the gene query is bad, the
    gene file is missing, no query was given, no abstracts match, or no
    genes match; otherwise with ``validresult=True``, the gene rows and one
    formatted p-value per row.
    """
    params = searchparams(request)

    # use homology option to decide which gene-abstract table and which
    # abstract-count column to use.
    if params.usehomologs:
        geneabstract_tablename = 'homologene_gene_abstract'
        abstract_col = 'homolog_abstracts'
    else:
        geneabstract_tablename = 'gene_abstract'
        abstract_col = 'abstracts'

    if params.genes or params.usegenefile:
        try:
            # get a query to run against the abstract index
            if params.usegenefile:
                genequery = genefile_lookup(
                    params.genefileID,
                    implicitOr=params.implicitOr,
                    usehomologs=params.usehomologs)
            else:
                genequery = parse_gene_abstractquery(
                    q=params.genes,
                    tax=params.species,
                    implicitOr=params.implicitOr,
                    usehomologs=params.usehomologs)
        except LookupError as e:
            # a term in the gene query couldn't be matched to any genes.
            # NOTE(review): e.args[0] is interpolated into HTML markup
            # unescaped; if it can carry user input, confirm it is escaped
            # before being rendered.
            return searchresponse(
                validresult=False,
                download=params.download,
                errmsg='No genes match <b>{0}</b> for species {1}'.format(e.args[0], params.species))
        except BadGenefileError:
            return searchresponse(
                validresult=False,
                download=params.download,
                errmsg="Can't find this gene file! It probably expired. Please upload it again.")
    else:
        genequery = None

    # don't do anything if we don't have a query
    if not genequery and not params.keywords:
        return searchresponse(
            validresult=False,
            download=params.download,
            errmsg="Please enter gene symbols or a keyword query.")

    # get abstracts matching keywords and genes
    abstracts = get_abstracts(params.keywords, genequery, params.usehomologs)
    query_abstract_count = len(abstracts)

    # error if no abstracts matched the query (checked by length so that any
    # empty sequence is caught, not just an empty list; this also guarantees
    # the IN (...) list and the divisions in the SQL below are well-formed)
    if query_abstract_count == 0:
        return searchresponse(
            validresult=False,
            download=params.download,
            errmsg="Your query did not match any abstracts.",
            query=params.keywords,
            genes=params.genes,
            usehomologs=params.usehomologs,
            usegenefile=params.usegenefile)

    # get corpus size
    total_abstract_count = corpus_size()

    # orderby term to insert into SQL; unknown values fall back to f1_score
    # (and params.orderby is updated so the response reports what was used)
    if params.orderby in query_orderbys:
        query_orderby = query_orderbys[params.orderby]
    else:
        query_orderby = params.orderby = 'f1_score'

    # one "%s" placeholder per abstract ID for the IN (...) clause
    placeholders = ','.join(['%s'] * query_abstract_count)

    # build SQL query for fetching genes.  Only fixed identifiers, the
    # whitelisted ORDER BY term and an integer count are formatted into the
    # text; abstract IDs, species, offset and limit are bound as parameters.
    sqlquery = """
    SELECT g.*,
        `{abstract_col}` `abstracts_display`,
        COUNT(*) hits,
        COUNT(*)/ (`{abstract_col}` + 10) `precision`,
        (2 * (COUNT(*) / `{abstract_col}`) * (COUNT(*) / {query_abstract_count})) /
            ((COUNT(*) / `{abstract_col}`) + (COUNT(*) / {query_abstract_count})) f1_score
    FROM `{geneabstract_tablename}` a
    INNER JOIN `gene` g
    ON g.entrez_id = a.gene
    WHERE a.`abstract` in ({paramstring})
    AND g.`tax_id` = %s
    GROUP BY g.entrez_id
    ORDER BY `{orderby}` DESC
    LIMIT %s, %s;
    """.format(
        paramstring=placeholders,
        orderby=query_orderby,
        query_abstract_count=query_abstract_count,
        geneabstract_tablename=geneabstract_tablename,
        abstract_col=abstract_col)

    # execute sql query, get genes
    sql_args = list(abstracts) + [params.species, params.offset,
                                  params.query_limit]
    results = Gene.objects.raw(sqlquery, sql_args)

    # calculate p values with R's phyper (upper tail, starting at hits-1)
    phyper = robjects.r['phyper']
    pvals_float = [
        phyper(g.hits - 1,
               query_abstract_count,
               total_abstract_count - query_abstract_count,
               g.abstracts_display,
               lower_tail=False)[0]
        for g in results
    ]

    # very small p-values are reported as a bound rather than a number
    pvals = [('{0:.2e}'.format(p) if p > 0.0000000001 else '< 1e-10')
             for p in pvals_float]

    if not pvals:
        return searchresponse(
            validresult=False,
            download=params.download,
            errmsg="Your query didn't match any genes.",
            query=params.keywords,
            genes=params.genes,
            usehomologs=params.usehomologs,
            species=params.species,
            usegenefile=params.usegenefile)

    return searchresponse(
        validresult=True,
        download=params.download,
        results=results,
        genes=params.genes,
        geneop=params.geneop,
        pvals=pvals,
        offset=params.offset,
        orderby=params.orderby,
        query=params.keywords,
        limit=params.limit,
        usehomologs=params.usehomologs,
        species=params.species,
        query_abstract_count=query_abstract_count,
        abstracts=abstracts,
        usegenefile=params.usegenefile)