def association_stats(request, sources=None):
    ''' Get association statistics for a chromosome, optionally limited to a
    position range, and return them as JSON.

    Reads the GET parameters C{chr} (any 'chr' text is removed), C{idx_type}
    (upper-cased and used as the IC_STATS index type) and the optional
    C{start}/C{end} pair; the range is applied only when both are given.

    @type  sources: array of result fields
    @keyword sources: The _source filtering to be used (default: None).
    @return: JsonResponse of the form {"variants": [...]}.
    '''
    params = request.GET
    seqid = params.get('chr').replace('chr', '')
    idx_type = params.get('idx_type').upper()
    start = params.get('start')
    end = params.get('end')
    variants = []

    def get_stats(resp_json):
        ''' Scroll callback: add one row per hit of the response page. '''
        for hit in resp_json['hits']['hits']:
            stat = Document(hit)
            variants.append({
                "CHROM": stat.seqid,
                "POS": stat.position,
                "PVALUE": stat.p_value,
                "DBSNP_ID": stat.marker
            })

    seqid_query = Query.query_string(seqid, fields=["seqid"])
    if start is None or end is None:
        main_query = seqid_query
    else:
        # both bounds supplied: restrict the hits to the position range
        main_query = BoolQuery(must_arr=[seqid_query,
                                         RangeQuery("position", gte=start, lte=end)])
    query = ElasticQuery(main_query, sources=sources)

    ScanAndScroll.scan_and_scroll(ElasticSettings.idx('IC_STATS', idx_type),
                                  call_fun=get_stats, query=query)
    return JsonResponse({"variants": variants})
def post(self, request, *args, **kwargs):
    ''' Get study hits for an Ensembl gene ID, a marker or a list of markers.

    The POST parameters are tried in this order: C{ens_id} (searched in the
    'genes' field), C{marker} and C{markers[]} (both searched in the 'marker'
    field). In every hit the 'genes' list is replaced by a dictionary of
    Ensembl ID to gene symbol and 'pmid' by a dictionary that also carries
    the first author's last name and the journal when the publication is found.

    @return: JsonResponse of the (augmented) study hits, or a JSON error with
    status 400 when none of the parameters is given.
    '''
    params = self.request.POST
    ens_id = params.get('ens_id')
    marker = params.get('marker')
    markers = params.getlist('markers[]')

    if ens_id:
        term, field = ens_id, "genes"
    elif marker:
        term, field = marker, "marker"
    elif markers:
        term, field = ' '.join(markers), "marker"
    else:
        # previously fell through with the filter unbound (UnboundLocalError)
        return JsonResponse({'error': "One of 'ens_id', 'marker' or 'markers[]' is required."},
                            status=400)

    sfilter = Filter(Query.query_string(term, fields=[field]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), sfilter)
    elastic = Search(query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'), size=500)
    study_hits = elastic.get_json_response()['hits']

    # collect every gene and publication referenced by the hits
    ens_ids = []
    pmids = []
    for hit in study_hits['hits']:
        source = hit['_source']
        if 'pmid' in source:
            pmids.append(source['pmid'])
        if 'genes' in source:
            ens_ids.extend(source['genes'])

    docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
    pub_docs = PublicationDocument.get_pub_docs_by_pmid(pmids, sources=['authors.name', 'journal'])

    for hit in study_hits['hits']:
        source = hit['_source']
        genes = {}
        if 'genes' in source:
            for gene_id in source['genes']:
                try:
                    genes[gene_id] = getattr(docs[gene_id], 'symbol')
                except KeyError:
                    # unknown gene: fall back to its ID, keeping the symbols already resolved
                    genes[gene_id] = gene_id
        source['genes'] = genes

        if 'pmid' in source:
            pmid = source['pmid']
            try:
                authors = getattr(pub_docs[pmid], 'authors')
                journal = getattr(pub_docs[pmid], 'journal')
                source['pmid'] = \
                    {'pmid': pmid,
                     'author': authors[0]['name'].rsplit(None, 1)[-1] if authors else "",
                     'journal': journal}
            except KeyError:
                source['pmid'] = {'pmid': pmid}
    return JsonResponse(study_hits)
def query_string(cls, query_term, sources=None, highlight=None, query_filter=None, **string_opts):
    ''' Factory method building an elastic Query String Query, wrapped in a
    filtered query when a filter is supplied.
    @type  query_term: string
    @param query_term: The string to use in the query.
    @type  sources: array of result fields
    @keyword sources: The _source filtering to be used (default: None).
    @type  highlight: Highlight
    @keyword highlight: Define the highlighting of results (default: None).
    @type  query_filter: Filter
    @keyword query_filter: Optional filter for query.
    @return: L{ElasticQuery}
    '''
    string_query = Query.query_string(query_term, **string_opts)
    if query_filter is not None:
        return cls(FilteredQuery(string_query, query_filter), sources, highlight)
    return cls(string_query, sources, highlight)
def test_bool_filtered_query2(self):
    ''' Test building and running a filtered boolean query.

    The boolean filter (two 'should' clauses and one 'must' clause) is applied
    to a query string search for rs768019142; exactly one hit is expected.
    '''
    query_bool = BoolQuery()
    query_bool.should(RangeQuery("start", lte=20000)) \
              .should(Query.term("seqid", 2)) \
              .must(Query.term("seqid", 1))
    query_string = Query.query_string("rs768019142", fields=["id", "seqid"])
    query = ElasticQuery.filtered_bool(query_string, query_bool, sources=["id", "seqid", "start"])
    elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    # assertEqual reports the actual hit count on failure, unlike assertTrue(a == b)
    self.assertEqual(elastic.search().hits_total, 1,
                     "Elastic filtered query retrieved marker (rs768019142)")
def test_or_filtered_query(self):
    ''' Test building and running a filtered query using an L{OrFilter}.

    The filter combines a range query, a boolean query and a wrapped query
    string; at least one hit is expected for the term query on seqid 1.
    '''
    highlight = Highlight(["id", "seqid"])
    query_bool = BoolQuery(must_arr=[RangeQuery("start", lte=1),
                                     RangeQuery("end", gte=100000)])
    or_filter = OrFilter(RangeQuery("start", gte=1, lte=100000))
    or_filter.extend(query_bool) \
             .extend(Query.query_string("rs*", fields=["id", "seqid"]).query_wrap())
    query = ElasticQuery.filtered(Query.term("seqid", 1), or_filter, highlight=highlight)
    elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    # assertGreaterEqual reports the actual hit count on failure, unlike assertTrue(a >= b)
    self.assertGreaterEqual(elastic.search().hits_total, 1,
                            "Elastic filtered query retrieved marker(s)")
def test_function_score_query(self):
    ''' Test a function score query whose score replaces the query score with
    the reciprocal of the start position; the returned documents are expected
    to have strictly increasing start values. '''
    reciprocal_start = ScoreFunction.create_score_function(
        'field_value_factor', field='start', modifier='reciprocal')
    rs_query = Query.query_string("rs*", fields=["id", "seqid"])
    scored = FunctionScoreQuery(rs_query, [reciprocal_start], boost_mode='replace')
    search = Search(ElasticQuery(scored), idx=ElasticSettings.idx('DEFAULT'))
    docs = search.search().docs
    n_docs = len(docs)
    self.assertGreater(n_docs, 1, str(n_docs))

    previous = 0
    for doc in docs:
        current = doc.start
        self.assertLess(previous, current)
        previous = current
def filter_queryset(self, request, queryset, view):
    ''' Override of the filter hook that returns feature locations.

    The 'feature' filter (default 'PTPN22') is searched in the marker index
    whose 'build' label matches the requested build, together with the region
    and gene indices. Every document found is turned into an L{ElasticObject}
    carrying the feature name, chromosome, start, end and a locus string.

    @return: list of L{ElasticObject} (a single error object when the feature name is empty).
    @raise Http404: on TypeError, ValueError, IndexError or ConnectionError.
    '''
    try:
        allowed = getattr(view, 'filter_fields', [])
        filters = {key: val for key, val in request.GET.items() if key in allowed}
        query_str = filters.get('feature', 'PTPN22')
        build = self._get_build(filters.get('build', settings.DEFAULT_BUILD))
        if query_str is None or query_str == '':
            return [ElasticObject(initial={'error': 'No feature name provided.'})]

        search_fields = ['id', 'symbol', 'dbxrefs.ensembl', 'region_name']
        sources = ['start', 'stop', 'seqid', 'chromosome', 'disease_loci']

        # pick the marker index key whose build label matches the request
        idx_keys = ElasticSettings.getattr('IDX')
        marker_key = ''
        if build == ElasticSettings.get_label('MARKER', label='build'):
            marker_key = 'MARKER'
        else:
            for key in idx_keys:
                # no break: the last matching key wins
                if 'MARKER' in key and build == ElasticSettings.get_label(key, label='build'):
                    marker_key = key

        (marker_idx, marker_type) = ElasticSettings.idx_names(marker_key, 'MARKER')
        (region_idx, region_type) = ElasticSettings.idx_names('REGION', 'REGION')
        (gene_idx, gene_type) = ElasticSettings.idx_names('GENE', 'GENE')
        idx = marker_idx + ',' + region_idx + ',' + gene_idx
        idx_type = marker_type + ',' + region_type + ',' + gene_type

        equery = BoolQuery(must_arr=Query.query_string(query_str, fields=search_fields))
        elastic = Search(search_query=ElasticQuery(equery, sources), size=10,
                         idx=idx, idx_type=idx_type)

        locations = []
        for doc in elastic.search().docs:
            if isinstance(doc, RegionDocument):
                doc = Region.pad_region_doc(doc)
            # position string is split as '<chr>:<start>[-<end>]'
            loc = doc.get_position(build=build).split(':')
            pos = loc[1].replace(',', '').split('-')
            start = int(pos[0])
            end = int(pos[1]) if len(pos) > 1 else int(pos[0])
            locations.append(ElasticObject(
                {'feature': query_str,
                 'chr': loc[0],
                 'start': start,
                 'end': end,
                 'locusString': query_str+" ("+str(loc[1])+")"}))
        return locations
    except (TypeError, ValueError, IndexError, ConnectionError):
        raise Http404
def _gene_lookup(search_term):
    ''' Look for any gene symbols (e.g. PTPN22) in the search term, get the
    corresponding gene document IDs and prepend them to the query string.

    The lookup is skipped when the term contains anything other than word
    characters and whitespace.

    @type  search_term: string
    @param search_term: The user's search term.
    @return: the search term, prefixed by the space separated IDs of up to 10
    matching gene documents when there are any.
    '''
    if re.search(r'[^\w\s]', search_term):
        logger.debug('skip gene lookup as contains non-word pattern %s', search_term)
        return search_term

    # raw string: "\w" in a plain literal is an invalid escape sequence
    words = re.sub(r"[^\w]", " ", search_term)
    equery = BoolQuery(b_filter=Filter(Query.query_string(words, fields=['symbol'])))
    search_query = ElasticQuery(equery, sources=['symbol'])
    (idx, idx_type) = ElasticSettings.idx('GENE', 'GENE').split('/')
    result = Search(search_query=search_query, size=10, idx=idx, idx_type=idx_type).search()
    if result.hits_total > 0:
        return ' '.join([doc.doc_id() for doc in result.docs]) + ' ' + search_term
    return search_term
def _build_exon_query(chrom, segmin, segmax, genes):
    ''' Get the exonic structure for the genes in a section.

    For every gene the exons index is searched by gene ID (in the 'name'
    field) on the given chromosome; the exons are made relative to the
    segment and sorted by start.

    @return: dictionary of gene ID to its sorted list of exons.
    '''
    exons_by_gene = {}
    chrom_filter = BoolQuery()
    chrom_filter.must([Query.term("seqid", chrom)])
    if len(genes) == 0:
        return exons_by_gene

    by_start = operator.itemgetter("start")
    for gene in genes:
        gene_id = gene["gene_id"]
        query = ElasticQuery.filtered_bool(Query.query_string(gene_id, fields=["name"]),
                                           chrom_filter, sources=utils.snpFields)
        exon_search = Search(query, idx=getattr(chicp_settings, 'CP_GENE_IDX')+'/exons/',
                             search_from=0, size=2000)
        exons = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'],
                                   exon_search.get_result()['data'])
        exons_by_gene[gene["gene_id"]] = sorted(exons, key=by_start)
    return exons_by_gene
def marker_page(request):
    ''' Renders a marker page.

    The marker name is read from the GET parameter C{m}. Names starting with
    'rs' are searched in the 'id' and 'rscurrent' fields, anything else in
    'name'. The hits are grouped by document type into the marker document,
    the immunochip documents and the rs merge history documents.

    @raise Http404: if no marker name is given or the marker is not found.
    '''
    query_dict = request.GET
    marker = query_dict.get("m")
    if marker is None:
        # this is the marker page: report a missing marker, not a missing gene
        messages.error(request, 'No marker name given.')
        raise Http404()

    fields = ['id', 'rscurrent'] if marker.startswith("rs") else ['name']
    sub_agg = Agg('top_hits', 'top_hits', {"size": 15})
    aggs = Aggs(Agg("types", "terms", {"field": "_type"}, sub_agg=sub_agg))
    query = ElasticQuery(Query.query_string(marker, fields=fields))
    elastic = Search(search_query=query, idx=ElasticSettings.idx('MARKER'), aggs=aggs, size=0)
    res = elastic.search()

    if res.hits_total >= 1:
        types = getattr(res.aggs['types'], 'buckets')
        marker_doc = None
        ic_docs = []
        history_docs = []
        for doc_type in types:
            hits = doc_type['top_hits']['hits']['hits']
            for hit in hits:
                doc = Document(hit)
                if 'marker' == doc_type['key']:
                    marker_doc = doc
                elif 'immunochip' == doc_type['key']:
                    ic_docs.append(doc)
                elif 'rs_merge' == doc_type['key']:
                    history_docs.append(doc)

        criteria = {}
        if marker_doc is not None:
            # criteria are only looked up when a criteria index is configured
            if ElasticSettings.idx('CRITERIA') is not None:
                criteria = views.get_criteria([marker_doc], 'marker', 'id', 'MARKER')
            marker_doc.marker_build = _get_marker_build(ElasticSettings.idx('MARKER'))

        context = {
            'marker': marker_doc,
            'old_dbsnp_docs': _get_old_dbsnps(marker),
            'ic': ic_docs,
            'history': history_docs,
            'criteria': criteria
        }
        return render(request, 'marker/marker.html', context,
                      content_type='text/html')
    elif res.hits_total == 0:
        messages.error(request, 'Marker '+marker+' not found.')
        raise Http404()
def _get_old_dbsnps(marker):
    ''' Get the marker from older versions of DBSNP. Every index whose key
    contains 'MARKER_' is searched (index names in reverse sorted order) and
    the first document found in each is returned, tagged with its marker build.

    @return: list of marker documents, one per older index with a hit.
    '''
    idx_keys = ElasticSettings.getattr('IDX').keys()
    older_idx_names = [ElasticSettings.idx(key) for key in idx_keys if 'MARKER_' in key]
    older_idx_names.sort(reverse=True)

    found = []
    if not older_idx_names:
        return found

    search_query = ElasticQuery(Query.query_string(marker, fields=['id', 'rscurrent']))
    for idx_name in older_idx_names:
        docs = Search(search_query=search_query, idx=idx_name, idx_type='marker').search().docs
        if len(docs) == 0:
            continue
        first_doc = docs[0]
        first_doc.marker_build = _get_marker_build(idx_name)
        found.append(first_doc)
    return found
def test_function_score_query2(self):
    ''' Test a function score query combining two score functions on the
    start position (summed, replacing the query score); the returned documents
    are expected to have strictly decreasing start values. '''
    by_start_1 = ScoreFunction.create_score_function('field_value_factor', field='start')
    by_start_2 = ScoreFunction.create_score_function('field_value_factor', field='start')
    rs_query = Query.query_string("rs*", fields=["id"])
    scored = FunctionScoreQuery(rs_query, [by_start_1, by_start_2],
                                score_mode='sum', boost_mode='replace',
                                min_score=1., max_boost=100000000.)
    query = ElasticQuery(scored, sources=['start'])
    docs = Search(query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    n_docs = len(docs)
    self.assertGreater(n_docs, 1, str(n_docs))

    previous = sys.maxsize
    for doc in docs:
        current = doc.start
        self.assertGreater(previous, current)
        previous = current
def get_marker(cls, request, marker, context):
    ''' Look a marker up and add the documents found to the context.

    Names starting with 'rs' are searched in the 'id' and 'rscurrent' fields,
    anything else in 'name'. The hits are grouped by document type into the
    marker document, the immunochip documents and the rs merge history.

    @return: the context with 'criteria', 'features', 'old_dbsnp_docs', 'ic',
    'history', 'title' and 'jbrowse_tracks' set.
    @raise Http404: if no marker name is given or the marker is not found.
    '''
    if marker is None:
        messages.error(request, 'No marker name given.')
        raise Http404()

    fields = ['name']
    if marker.startswith("rs"):
        fields = ['id', 'rscurrent']
    top_hits = Agg('top_hits', 'top_hits', {"size": 15})
    aggs = Aggs(Agg("types", "terms", {"field": "_type"}, sub_agg=top_hits))
    query = ElasticQuery(Query.query_string(marker, fields=fields))
    res = Search(search_query=query, idx=ElasticSettings.idx('MARKER'), aggs=aggs, size=0).search()

    title = ''
    if res.hits_total >= 1:
        marker_doc = None
        ic_docs = []
        history_docs = []
        for bucket in res.aggs['types'].buckets:
            for hit in bucket['top_hits']['hits']['hits']:
                doc = PydginDocument.factory(hit)
                # the last named document provides the page title
                if doc.get_name() is not None:
                    title = doc.get_name()

                doc_type = bucket['key']
                if doc_type == 'marker':
                    marker_doc = doc
                elif doc_type == 'immunochip':
                    ic_docs.append(doc)
                elif doc_type == 'rs_merge':
                    history_docs.append(doc)

        if marker_doc is not None:
            marker_doc.marker_build = _get_marker_build(ElasticSettings.idx('MARKER'))

        context['criteria'] = MarkerView.criteria_disease_tags(request, [marker])
        context['features'] = [marker_doc]
        context['old_dbsnp_docs'] = _get_old_dbsnps(marker)
        context['ic'] = ic_docs
        context['history'] = history_docs
        context['title'] = title
        context['jbrowse_tracks'] = "PydginRegions%2Cdbsnp146%2CEnsemblGenes"
        return context
    elif res.hits_total == 0:
        messages.error(request, 'Marker '+marker+' not found.')
        raise Http404()
def studies_details(request):
    """ Get studies for a given ensembl ID.

    The ID is read from the POST parameter 'ens_id' and searched in the
    'genes' field of the study hits. In every hit the 'genes' list is replaced
    by a dictionary of Ensembl ID to gene symbol (the ID itself when the gene
    is unknown) and 'pmid' by a dictionary that also carries the first
    author's last name and the journal when the publication is found.

    @return: JsonResponse of the (augmented) study hits.
    """
    ens_id = request.POST.get("ens_id")
    sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), sfilter)
    elastic = Search(query, idx=ElasticSettings.idx("REGION", "STUDY_HITS"), size=500)
    study_hits = elastic.get_json_response()["hits"]

    # collect every gene and publication referenced by the hits
    ens_ids = []
    pmids = []
    for hit in study_hits["hits"]:
        source = hit["_source"]
        if "pmid" in source:
            pmids.append(source["pmid"])
        # tolerate hits without a gene list
        ens_ids.extend(source.get("genes", []))

    docs = _get_gene_docs_by_ensembl_id(ens_ids, ["symbol"])
    pub_docs = _get_pub_docs_by_pmid(pmids, sources=["authors.name", "journal"])

    for hit in study_hits["hits"]:
        source = hit["_source"]
        genes = {}
        for gene_id in source.get("genes", []):
            try:
                genes[gene_id] = getattr(docs[gene_id], "symbol")
            except KeyError:
                # unknown gene: fall back to its ID, keeping the symbols already resolved
                genes[gene_id] = gene_id
        source["genes"] = genes

        if "pmid" in source:
            pmid = source["pmid"]
            try:
                authors = getattr(pub_docs[pmid], "authors")
                journal = getattr(pub_docs[pmid], "journal")
                source["pmid"] = {
                    "pmid": pmid,
                    # an empty author list yields an empty name instead of an IndexError
                    "author": authors[0]["name"].rsplit(None, 1)[-1] if authors else "",
                    "journal": journal,
                }
            except KeyError:
                source["pmid"] = {"pmid": pmid}
    return JsonResponse(study_hits)
def _build_filters(self, filters=None): ''' Build filters using L{AndFilter}. ''' if filters is None: filters = {} and_filter = None for filter_expr, value in filters.items(): filter_bits = filter_expr.split('__') field_name = filter_bits.pop(0) filter_type = 'exact' if len(filter_bits): filter_type = filter_bits.pop() if filter_type != 'exact': field_name = field_name + "." + filter_type q = Query.query_string(value, fields=[field_name]).query_wrap() if and_filter is None: and_filter = AndFilter(q) else: and_filter.extend(q) return and_filter
def genesets_details(request):
    """ Get the pathway gene sets containing a given ensembl ID.

    The ID is read from the POST parameter 'ens_id'. In every hit the
    'gene_sets' list is replaced by a dictionary of Ensembl ID to gene symbol
    (the ID itself when the gene is unknown).

    @return: JsonResponse of the (augmented) pathway hits.
    """
    ens_id = request.POST.get("ens_id")
    geneset_filter = Filter(Query.query_string(ens_id, fields=["gene_sets"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), geneset_filter)
    pathway_search = Search(query, idx=ElasticSettings.idx("GENE", "PATHWAY"), size=500)
    genesets_hits = pathway_search.get_json_response()["hits"]
    hits = genesets_hits["hits"]

    # every gene of every gene set, duplicates included
    all_gene_ids = [gene_id for hit in hits for gene_id in hit["_source"]["gene_sets"]]
    docs = _get_gene_docs_by_ensembl_id(all_gene_ids, ["symbol"])

    for hit in hits:
        source = hit["_source"]
        symbols = {}
        for gene_id in source["gene_sets"]:
            try:
                symbol = getattr(docs[gene_id], "symbol")
            except KeyError:
                symbol = gene_id
            symbols[gene_id] = symbol
        source["gene_sets"] = symbols
    return JsonResponse(genesets_hits)
def genesets_details(request):
    ''' Get the pathway gene sets containing a given ensembl ID.

    The ID is read from the POST parameter 'ens_id'. In every hit the
    'gene_sets' list is replaced by a dictionary of Ensembl ID to gene symbol
    (the ID itself when the gene is unknown).

    @return: JsonResponse of the (augmented) pathway hits.
    '''
    ens_id = request.POST.get('ens_id')
    geneset_filter = Filter(Query.query_string(ens_id, fields=["gene_sets"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), geneset_filter)
    pathway_search = Search(query, idx=ElasticSettings.idx('GENE', 'PATHWAY'), size=500)
    genesets_hits = pathway_search.get_json_response()['hits']

    member_ids = []
    for hit in genesets_hits['hits']:
        member_ids.extend(hit['_source']['gene_sets'])
    docs = utils.get_gene_docs_by_ensembl_id(member_ids, ['symbol'])

    def symbol_or_id(gene_id):
        ''' Gene symbol for the ID, or the ID itself when it has no document. '''
        try:
            return getattr(docs[gene_id], 'symbol')
        except KeyError:
            return gene_id

    for hit in genesets_hits['hits']:
        source = hit['_source']
        source['gene_sets'] = {gene_id: symbol_or_id(gene_id) for gene_id in source['gene_sets']}
    return JsonResponse(genesets_hits)
def chicpeaSearch(request, url):
    ''' Search the interaction data for a region, a marker or a gene and
    return everything needed to draw the segment as JSON.

    GET parameters read: 'targetIdx', 'searchTerm', 'snp_track' and 'region'.
    The search type starts as 'gene'; it becomes 'region' when a 'region'
    parameter is given or the search term looks like '<chr>:<start>-<end>',
    and 'snp' when the term starts with 'rs' followed by digits (unless it is
    already a region search).

    @return: JsonResponse with the keys 'hic', 'frags', 'meta', 'snps',
    'snp_meta', 'genes', 'region', 'blueprint' and 'extra', or a JsonResponse
    with an 'error' key.
    '''
    # NOTE(review): 'url' is not used in this function.
    queryDict = request.GET
    user = request.user
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    hic = []
    addList = []
    searchType = 'gene'
    # NOTE(review): a request without 'searchTerm' fails here (None has no upper()).
    searchTerm = queryDict.get("searchTerm").upper()
    searchTerm = searchTerm.replace(",", "")
    searchTerm = searchTerm.replace("..", "-")
    snpTrack = queryDict.get("snp_track")

    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(
        user=user, idx_keys=None, idx_type_keys=None)

    if snpTrack:
        # the track name is split at its last '-' into group and track
        # NOTE(review): 'mo' is None for a track name without '-', so .group() would fail.
        mo = re.match(r"(.*)-(.*)", snpTrack)
        (group, track) = mo.group(1, 2)  # @UnusedVariable
        # drop the track when it is not one of the index types returned for the user
        # (tracks in the 'ud' group are exempt from this check)
        if group != 'ud' and 'CP_STATS_'+group.upper()+'.'+snpTrack.upper() not in idx_type_keys_auth:
            snpTrack = None

    # NOTE(review): the entries stored below are keyed 'CP_TARGET_'+target while this
    # test uses the bare targetIdx - confirm this cache check can ever be False.
    if targetIdx not in utils.tissues:
        for target in getattr(chicp_settings, 'CP_TARGET'):
            if 'CP_TARGET_'+target not in idx_keys_auth:
                if targetIdx == target:
                    retJSON = {'error': 'Sorry, you do not have permission to view this dataset.'}
                    return JsonResponse(retJSON)
                continue
            # the tissue names are the keys of the mapping's _meta.tissue_type
            elasticJSON = Search(idx=ElasticSettings.idx('CP_TARGET_'+target)).get_mapping(mapping_type="gene_target")
            tissueList = list(elasticJSON[ElasticSettings.idx('CP_TARGET_'+target)]
                              ['mappings']['gene_target']['_meta']['tissue_type'].keys())
            utils.tissues['CP_TARGET_'+target] = tissueList

    if queryDict.get("region") or re.match(r"(.*):(\d+)-(\d+)", searchTerm):
        searchType = 'region'
        region = searchTerm
        if queryDict.get("region"):
            region = queryDict.get("region")
        else:
            # the search term was the region itself, so no term is left to search for
            searchTerm = ""
        # NOTE(review): 'mo' is None when the 'region' parameter does not match the pattern.
        mo = re.match(r"(.*):(\d+)-(\d+)", region)
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "")
        chrom = chrom.replace('CHR', "")
    if re.search("^rs[0-9]+", searchTerm.lower()):
        searchTerm = searchTerm.lower()
        addList.append(_find_snp_position(snpTrack, searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        if searchType != 'region':
            searchType = 'snp'

    # NOTE(review): if 'logger' is a standard logging.Logger, warn() is a deprecated
    # alias of warning() - confirm and consider switching.
    logger.warn("### "+searchType+" - "+searchTerm+' ###')

    if searchType == 'region':
        query_bool = BoolQuery()
        filter_bool = BoolQuery()
        # the term is only added to the query when there is no marker and the
        # original search term is not itself a region
        # NOTE(review): '== None' works here but 'is None' is the idiomatic test.
        if searchTerm and len(addList) == 0 and re.match(r"(.*):(\d+)-(\d+)",
                                                         queryDict.get("searchTerm").replace(",", "")) == None:
            query_bool.must([Query.query_string(searchTerm, fields=["name", "ensg"]),
                             Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        else:
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])

        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if len(addList) > 0:
            # a marker was found: keep interactions whose bait or other end spans its position
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])
        else:
            # keep interactions whose bait or other end lies completely inside the region
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool, sources=utils.hicFields +
                                           utils.tissues['CP_TARGET_'+targetIdx])
        # the segment bounds returned here are discarded; the requested region is kept
        (hic, v1, v2) = _build_hic_query(query, targetIdx, segmin, segmax)  # @UnusedVariable

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': queryDict.get("searchTerm")+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        if len(addList) > 0:
            chrom = addList[0]['chr']

            query_bool = BoolQuery()
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
            query_bool = _add_tissue_filter(query_bool, targetIdx)

            # keep interactions whose bait or other end spans the marker position
            filter_bool = BoolQuery()
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])

            query = ElasticQuery.filtered_bool(query_bool, filter_bool, sources=utils.hicFields +
                                               utils.tissues['CP_TARGET_'+targetIdx])
            # here the segment bounds come from the interactions found
            hic, segmin, segmax = _build_hic_query(query, targetIdx)

            if "error" in hic:
                return JsonResponse(hic)
            if len(hic) == 0:
                retJSON = {'error': 'Marker '+searchTerm+' does not overlap any bait/target regions in this dataset.'}
                return JsonResponse(retJSON)
    else:
        # geneQuery = ElasticQuery.query_string(searchTerm, fields=["gene_name"])
        geneQuery = ElasticQuery.filtered(Query.match_all(),
                                          Filter(Query.match("gene_name", searchTerm).query_wrap()))
        # first search only counts the matching genes (size=0)
        resultObj = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/',
                           search_query=geneQuery, size=0, qsort=Sort('seqid:asc,start')).search()
        if resultObj.hits_total > 1:
            # ambiguous gene name: fetch all candidates and keep those present in the target index
            geneResults = []
            resultObj2 = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/', search_query=geneQuery,
                                size=(resultObj.hits_total+1), qsort=Sort('seqid:asc,start')).search()
            docs = resultObj2.docs
            # [1:-1] drops the first and last character of the stored gene_id
            # (presumably surrounding double quotes, cf. the replace('\"', '') below - confirm)
            gene_ids = [getattr(doc, 'attr')['gene_id'][1:-1] for doc in docs]

            query = ElasticQuery.filtered(Query.match_all(), TermsFilter.get_terms_filter('ensg', gene_ids))
            agg = Agg('ensg_agg', "terms", {"field": "ensg", "size": 0})
            res = Search(idx=ElasticSettings.idx('CP_TARGET_'+targetIdx), search_query=query, aggs=Aggs(agg),
                         size=0).search()
            ensg_count = res.aggs['ensg_agg'].get_buckets()
            gene_ids = [g['key'] for g in ensg_count]

            for d in resultObj2.docs:
                if getattr(d, "attr")["gene_id"].replace('\"', '') in gene_ids:
                    geneResults.append({
                        'gene_name': getattr(d, "attr")["gene_name"].replace('\"', ''),
                        'gene_id': getattr(d, "attr")["gene_id"].replace('\"', ''),
                        'location': "chr" + getattr(d, "seqid") + ":" +
                        locale.format_string("%d", getattr(d, "start"), grouping=True) + ".." +
                        locale.format_string("%d", getattr(d, "end"), grouping=True),
                    })

            if len(geneResults) == 0:
                retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
                return JsonResponse(retJSON)
            elif len(geneResults) > 1:
                retJSON = {
                    'error': 'Gene name <strong>'+searchTerm+'</strong> returns too many hits, please select your prefered result from the list below.',
                    'results': geneResults,
                    'cols': ['HGNC Symbol', 'Ensembl Gene ID', 'Location']
                }
                return JsonResponse(retJSON)
            # exactly one candidate left: fall through to the interaction search

        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool, sources=utils.hicFields +
                                           utils.tissues['CP_TARGET_'+targetIdx])

        (hic, segmin, segmax) = _build_hic_query(query, targetIdx)

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
            return JsonResponse(retJSON)
        chrom = hic[0]['baitChr']

    # Probe whether one of the branches above bound 'chrom': reading an unbound
    # local raises UnboundLocalError, which is a subclass of NameError.
    try:
        chrom
    except NameError:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(snpTrack, chrom, segmin, segmax)
    frags = _build_frags_query(getattr(chicp_settings, 'DEFAULT_FRAG'), chrom, segmin, segmax)

    addList = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], addList)

    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": int(segmin),
                        "oend": int(segmax),
                        "rstart": 1,
                        "rend": int(segmax) - int(segmin),
                        "rchr": str(chrom),
                        "tissues": utils.tissues['CP_TARGET_'+targetIdx]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }

    response = JsonResponse(retJSON)
    return response
def _get_ens_gene(self, gene_list):
    ''' Search the gene index for the terms of a '__' separated list and
    return the IDs of the documents found.

    @type  gene_list: string
    @param gene_list: Search terms separated by '__'.
    @return: list of document IDs.
    '''
    search_terms = re.sub("__", " ", gene_list)
    gene_query = ElasticQuery(Query.query_string(search_terms))
    gene_search = Search(gene_query, idx=ElasticSettings.idx('GENE', 'GENE'))
    doc_ids = []
    for doc in gene_search.search().docs:
        doc_ids.append(doc.doc_id())
    return doc_ids
def do_identifier_search(cls, identifiers, user=None):
    ''' Search the region, gene, study and marker indices for a list of
    identifiers and group the matches by feature type.

    All identifiers are joined into one query string. For every document
    returned, the first highlighted fragment (the text between the first
    <em> and </em>) is used as the key under which the feature ID is stored.

    @type  identifiers: list of strings
    @param identifiers: The identifiers to search for.
    @return: dictionary with the keys 'gene', 'marker', 'region' and 'study'
    (each a dictionary of highlighted text to feature IDs) and 'missing' (the
    lower-cased identifiers that are not among the lower-cased highlighted texts).
    '''
    # NOTE(review): 'user' is not used in this function.
    source_filter = [
        'symbol', 'synonyms', "dbxrefs.*",  # gene
        'id', 'rscurrent', 'rshigh',  # marker
        'study_id', 'study_name',  # study
        'region_name', 'marker', "region_id"]  # regions

    highlight = Highlight(["symbol", "dbxrefs.*", "region",
                           "region_name", "region_id",
                           "study_id", "study_name",
                           "id", "rscurrent", "rshigh", "marker"])

    # the same field list is used to search and to filter the returned _source
    search_query = ElasticQuery(Query.query_string(" ".join(identifiers), fields=source_filter),
                                highlight=highlight, sources=source_filter)

    search_idx_keys = ['REGION', 'GENE', 'STUDY', 'MARKER']
    search_idx_type_keys = ['REGION', 'GENE', 'STUDY', 'MARKER']

    idx_all = [ElasticSettings.idx_names(idx, idx_type=idx_type) for idx, idx_type in zip(search_idx_keys,
                                                                                         search_idx_type_keys)]
    # idx_names() results are used as (index name, index type) pairs
    idx_dict = dict(idx_all)

    search_idx = ','.join(idx_dict.keys())
    search_idx_types = ','.join(idx_dict.values())

    elastic = Search(search_query=search_query, idx=search_idx, idx_type=search_idx_types)

    gene_dict = {}
    region_dict = {}
    marker_dict = {}
    study_dict = {}

    docs = elastic.search().docs
    for doc in docs:
        # starts empty for every document; see the review note below
        existing_feature_list = []

        # NOTE(review): 'idx' is assigned but not used afterwards.
        idx = getattr(doc, '_meta')['_index']
        idx_type = getattr(doc, '_meta')['_type']
        doc_id = doc.doc_id()

        # rebinds 'highlight' (the Highlight object above has already been passed to the query)
        highlight = doc.highlight()
        if highlight is not None:
            # capture the first highlighted fragment of the stringified highlight
            pattern = ".*?<em>(.*?)</em>.*"
            result = re.match(pattern, str(highlight))
            if result is not None:
                highlight_hit = result.group(1)

                # NOTE(review): in each branch below the stored list is reused only when
                # it already contains feature_id; otherwise the fresh empty list is used
                # and the previous entries for this highlighted text are replaced.
                # This looks unintended - confirm whether IDs should accumulate.
                if idx_type == "studies":
                    feature_id = getattr(doc, "study_id")

                    if highlight_hit not in study_dict:
                        study_dict[highlight_hit] = {}

                    if feature_id in study_dict[highlight_hit]:
                        existing_feature_list = study_dict[highlight_hit]
                    existing_feature_list.append(feature_id)
                    study_dict[highlight_hit] = existing_feature_list

                if idx_type == "gene":
                    feature_id = doc_id

                    if highlight_hit not in gene_dict:
                        gene_dict[highlight_hit] = {}

                    if feature_id in gene_dict[highlight_hit]:
                        existing_feature_list = gene_dict[highlight_hit]
                    existing_feature_list.append(feature_id)
                    gene_dict[highlight_hit] = existing_feature_list

                if idx_type == "marker":
                    feature_id = getattr(doc, "id")

                    if highlight_hit not in marker_dict:
                        marker_dict[highlight_hit] = {}

                    if feature_id in marker_dict[highlight_hit]:
                        existing_feature_list = marker_dict[highlight_hit]
                    existing_feature_list.append(feature_id)
                    marker_dict[highlight_hit] = existing_feature_list

                if idx_type == "region":
                    feature_id = getattr(doc, "region_id")

                    if highlight_hit not in region_dict:
                        region_dict[highlight_hit] = {}

                    if feature_id in region_dict[highlight_hit]:
                        existing_feature_list = region_dict[highlight_hit]
                    existing_feature_list.append(feature_id)
                    region_dict[highlight_hit] = existing_feature_list

    all_result_dict = {}
    all_result_dict['gene'] = gene_dict
    all_result_dict['marker'] = marker_dict
    all_result_dict['region'] = region_dict
    all_result_dict['study'] = study_dict

    # identifiers without a matching highlighted text are reported as missing
    # (compared case-insensitively)
    original_list = [_id.lower() for _id in identifiers]
    result_list = list(study_dict.keys()) + list(gene_dict.keys()) + list(marker_dict.keys()) + \
        list(region_dict.keys())
    result_list = [_id.lower() for _id in result_list]

    diff_list = set(original_list) - set(result_list)
    all_result_dict['missing'] = list(diff_list)
    return all_result_dict
def chicpeaSearch(request, url):
    ''' Search the interaction data for a region, a marker or a gene and
    return everything needed to draw the segment as JSON.

    GET parameters read: 'targetIdx', 'searchTerm', 'snp_track' and 'region'.
    The search type starts as 'gene'; it becomes 'region' when a 'region'
    parameter is given or the search term looks like '<chr>:<start>-<end>',
    and 'snp' when the term starts with 'rs' followed by digits (unless it is
    already a region search).

    @return: JsonResponse with the keys 'hic', 'frags', 'meta', 'snps',
    'snp_meta', 'genes', 'region', 'blueprint' and 'extra', or a JsonResponse
    with an 'error' key.
    '''
    # NOTE(review): 'url' is not used in this function.
    queryDict = request.GET
    user = request.user
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    hic = []
    addList = []
    searchType = 'gene'
    # NOTE(review): a request without 'searchTerm' fails here (None has no upper()).
    searchTerm = queryDict.get("searchTerm").upper()
    searchTerm = searchTerm.replace(",", "")
    searchTerm = searchTerm.replace("..", "-")
    searchTerm = searchTerm.replace(" ", "")  # Chris suggestion to prevent issue with spaces in queries
    snpTrack = queryDict.get("snp_track")

    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(
        user=user, idx_keys=None, idx_type_keys=None)

    if snpTrack:
        # the track name is split at its last '-' into group and track
        # NOTE(review): 'mo' is None for a track name without '-', so .group() would fail.
        mo = re.match(r"(.*)-(.*)", snpTrack)
        (group, track) = mo.group(1, 2)  # @UnusedVariable
        # drop the track when it is not one of the index types returned for the user
        # (tracks in the 'ud' group are exempt from this check)
        if group != 'ud' and 'CP_STATS_'+group.upper()+'.'+snpTrack.upper() not in idx_type_keys_auth:
            snpTrack = None

    # NOTE(review): the entries stored below are keyed 'CP_TARGET_'+target while this
    # test uses the bare targetIdx - confirm this cache check can ever be False.
    if targetIdx not in utils.tissues:
        for target in getattr(chicp_settings, 'CP_TARGET'):
            if 'CP_TARGET_'+target not in idx_keys_auth:
                if targetIdx == target:
                    retJSON = {'error': 'Sorry, you do not have permission to view this dataset.'}
                    return JsonResponse(retJSON)
                continue
            # the tissue names are the keys of the mapping's _meta.tissue_type
            elasticJSON = Search(idx=ElasticSettings.idx('CP_TARGET_'+target)).get_mapping(mapping_type="gene_target")
            tissueList = list(elasticJSON[ElasticSettings.idx('CP_TARGET_'+target)]
                              ['mappings']['gene_target']['_meta']['tissue_type'].keys())
            utils.tissues['CP_TARGET_'+target] = tissueList

    if queryDict.get("region") or re.match(r"(.*):(\d+)-(\d+)", searchTerm):
        searchType = 'region'
        region = searchTerm
        if queryDict.get("region"):
            region = queryDict.get("region")
        else:
            # the search term was the region itself, so no term is left to search for
            searchTerm = ""
        # NOTE(review): 'mo' is None when the 'region' parameter does not match the pattern.
        mo = re.match(r"(.*):(\d+)-(\d+)", region)
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "")
        chrom = chrom.replace('CHR', "")
    if re.search("^rs[0-9]+", searchTerm.lower()):
        searchTerm = searchTerm.lower()
        addList.append(_find_snp_position(snpTrack, searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        if searchType != 'region':
            searchType = 'snp'

    # NOTE(review): if 'logger' is a standard logging.Logger, warn() is a deprecated
    # alias of warning() - confirm and consider switching.
    logger.warn("### "+searchType+" - "+searchTerm+' ###')

    if searchType == 'region':
        query_bool = BoolQuery()
        filter_bool = BoolQuery()
        # the term is only added to the query when there is no marker and the
        # original search term is not itself a region
        # NOTE(review): '== None' works here but 'is None' is the idiomatic test.
        if searchTerm and len(addList) == 0 and re.match(r"(.*):(\d+)-(\d+)",
                                                         queryDict.get("searchTerm").replace(",", "")) == None:
            query_bool.must([Query.query_string(searchTerm, fields=["name", "ensg"]),
                             Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        else:
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])

        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if len(addList) > 0:
            # a marker was found: keep interactions whose bait or other end spans its position
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])
        else:
            # keep interactions whose bait or other end lies completely inside the region
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool, sources=utils.hicFields +
                                           utils.tissues['CP_TARGET_'+targetIdx])
        # the segment bounds returned here are discarded; the requested region is kept
        (hic, v1, v2) = _build_hic_query(query, targetIdx, segmin, segmax)  # @UnusedVariable

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': queryDict.get("searchTerm")+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        if len(addList) > 0:
            chrom = addList[0]['chr']

            query_bool = BoolQuery()
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
            query_bool = _add_tissue_filter(query_bool, targetIdx)

            # keep interactions whose bait or other end spans the marker position
            filter_bool = BoolQuery()
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])

            query = ElasticQuery.filtered_bool(query_bool, filter_bool, sources=utils.hicFields +
                                               utils.tissues['CP_TARGET_'+targetIdx])
            # here the segment bounds come from the interactions found
            hic, segmin, segmax = _build_hic_query(query, targetIdx)

            if "error" in hic:
                return JsonResponse(hic)
            if len(hic) == 0:
                retJSON = {'error': 'Marker '+searchTerm+' does not overlap any bait/target regions in this dataset.'}
                return JsonResponse(retJSON)
    else:
        # geneQuery = ElasticQuery.query_string(searchTerm, fields=["gene_name"])
        geneQuery = ElasticQuery.filtered(Query.match_all(),
                                          Filter(Query.match("gene_name", searchTerm).query_wrap()))
        # first search only counts the matching genes (size=0)
        resultObj = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/',
                           search_query=geneQuery, size=0, qsort=Sort('seqid:asc,start')).search()
        if resultObj.hits_total > 1:
            # ambiguous gene name: fetch all candidates and keep those present in the target index
            geneResults = []
            resultObj2 = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/', search_query=geneQuery,
                                size=(resultObj.hits_total+1), qsort=Sort('seqid:asc,start')).search()
            docs = resultObj2.docs
            # [1:-1] drops the first and last character of the stored gene_id
            # (presumably surrounding double quotes, cf. the replace('\"', '') below - confirm)
            gene_ids = [getattr(doc, 'attr')['gene_id'][1:-1] for doc in docs]

            query = ElasticQuery.filtered(Query.match_all(), TermsFilter.get_terms_filter('ensg', gene_ids))
            agg = Agg('ensg_agg', "terms", {"field": "ensg", "size": 0})
            res = Search(idx=ElasticSettings.idx('CP_TARGET_'+targetIdx), search_query=query, aggs=Aggs(agg),
                         size=0).search()
            ensg_count = res.aggs['ensg_agg'].get_buckets()
            gene_ids = [g['key'] for g in ensg_count]

            for d in resultObj2.docs:
                if getattr(d, "attr")["gene_id"].replace('\"', '') in gene_ids:
                    geneResults.append({
                        'gene_name': getattr(d, "attr")["gene_name"].replace('\"', ''),
                        'gene_id': getattr(d, "attr")["gene_id"].replace('\"', ''),
                        'location': "chr" + getattr(d, "seqid") + ":" +
                        locale.format_string("%d", getattr(d, "start"), grouping=True) + ".." +
                        locale.format_string("%d", getattr(d, "end"), grouping=True),
                    })

            if len(geneResults) == 0:
                retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
                return JsonResponse(retJSON)
            elif len(geneResults) > 1:
                retJSON = {
                    'error': 'Gene name <strong>'+searchTerm+'</strong> returns too many hits, please select your prefered result from the list below.',
                    'results': geneResults,
                    'cols': ['HGNC Symbol', 'Ensembl Gene ID', 'Location']
                }
                return JsonResponse(retJSON)
            # exactly one candidate left: fall through to the interaction search

        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool, sources=utils.hicFields +
                                           utils.tissues['CP_TARGET_'+targetIdx])

        (hic, segmin, segmax) = _build_hic_query(query, targetIdx)

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
            return JsonResponse(retJSON)
        chrom = hic[0]['baitChr']

    # Probe whether one of the branches above bound 'chrom': reading an unbound
    # local raises UnboundLocalError, which is a subclass of NameError.
    try:
        chrom
    except NameError:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(snpTrack, chrom, segmin, segmax)
    frags = _build_frags_query(getattr(chicp_settings, 'DEFAULT_FRAG'), chrom, segmin, segmax)

    addList = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], addList)

    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": int(segmin),
                        "oend": int(segmax),
                        "rstart": 1,
                        "rend": int(segmax) - int(segmin),
                        "rchr": str(chrom),
                        "tissues": utils.tissues['CP_TARGET_'+targetIdx]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }

    response = JsonResponse(retJSON)
    return response
def _search_engine(query_dict, user_filters, user):
    ''' Run the search for the user's query and return the results as a dictionary.

    Reads "query" and "idx" from query_dict and "maxsize" plus any field
    filters from user_filters. The returned dictionary holds the top hits
    ('data'), the aggregations ('aggs'), the original query string, the
    index name, the fields searched, the index mappings, the total hit
    count, the page size used and the time taken.
    '''
    user_query = query_dict.get("query")
    query = _gene_lookup(user_query)

    # fields returned for each hit, grouped by the index they come from
    source_filter = ['symbol', 'synonyms', "dbxrefs.*", 'biotype', 'description']  # gene
    source_filter += ['id', 'rscurrent', 'rshigh']                                # marker
    source_filter += ['journal', 'title', 'tags.disease']                         # publication
    source_filter += ['name', 'code']                                             # disease
    source_filter += ['study_id', 'study_name']                                   # study
    source_filter += ['region_name', 'marker']                                    # regions

    # a query made up only of digits and spaces may be one or more PMIDs
    if re.search(r'^[0-9 ]+$', query):
        source_filter.append('pmid')

    maxsize = int(user_filters.get("maxsize")) if user_filters.getlist("maxsize") else 20

    # collect the fields to search from the user supplied filter values
    search_fields = []
    for item in user_filters.items():
        if len(item) != 2 or item[0] == 'query':
            continue
        pieces = item[1].split(":")
        if len(pieces) in (2, 3):
            # the leading piece is dropped, the remainder forms the field path
            search_fields.append(".".join(pieces[1:]))

    if not search_fields:
        # no user selection: search the returned fields plus these extras
        search_fields = source_filter + [
            'abstract', 'authors.name',   # publication
            'authors', 'pmids',           # study
            'markers', 'genes',           # study/region
        ]

    # fields that are returned with a hit but are not part of the default search
    source_filter += ['date', 'pmid', 'build_id', 'ref', 'alt', 'chr_band',
                      'disease_locus', 'disease_loci', 'region_id']

    idx_name = query_dict.get("idx")
    idx_dict = ElasticSettings.search_props(idx_name, user)
    query_filters = _get_query_filters(user_filters, user)

    highlight = Highlight(search_fields, pre_tags="<strong>", post_tags="</strong>",
                          number_of_fragments=0)
    top_hits_body = {
        "size": maxsize,
        "_source": source_filter,
        "highlight": highlight.highlight['highlight'],
    }
    top_hits_agg = Agg('idx_top_hits', 'top_hits', top_hits_body)
    index_agg = Agg("idxs", "terms", {"field": "_index"}, sub_agg=top_hits_agg)
    biotype_agg = Agg("biotypes", "terms", {"field": "biotype", "size": 0})
    category_agg = Agg("categories", "terms", {"field": "_type", "size": 0})
    aggs = Aggs([index_agg, biotype_agg, category_agg])

    # scoring functions for the selected indices
    score_fns = _build_score_functions(idx_dict)
    text_query = Query.query_string(query, fields=search_fields)
    auth_clauses = _auth_arr(user)
    bool_query = BoolQuery(must_arr=text_query, should_arr=auth_clauses,
                           b_filter=query_filters, minimum_should_match=1)
    scored_query = FunctionScoreQuery(bool_query, score_fns, boost_mode='replace')

    elastic = Search(search_query=ElasticQuery(scored_query), aggs=aggs, size=0,
                     idx=idx_dict['idx'], idx_type=idx_dict['idx_type'])
    result = elastic.search()

    mappings = elastic.get_mapping()
    _update_mapping_filters(mappings, result.aggs)
    _update_biotypes(user_filters, result)

    response = {'data': _top_hits(result)}
    response['aggs'] = result.aggs
    response['query'] = user_query
    response['idx_name'] = idx_name
    response['fields'] = search_fields
    response['mappings'] = mappings
    response['hits_total'] = result.hits_total
    response['maxsize'] = maxsize
    response['took'] = result.took
    return response
def _position_overlap_filter(position):
    ''' Build a filter matching documents where either
    baitStart <= position <= baitEnd or oeStart <= position <= oeEnd. '''
    filter_bool = BoolQuery()
    filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                            RangeQuery("baitEnd", gte=position)]),
                        BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                            RangeQuery("oeEnd", gte=position)])])
    return filter_bool


def chicpeaSearch(request, url):
    ''' Run a chicpea search from the GET parameters and return the result as JSON.

    GET parameters read: "targetIdx", "searchTerm", "region" and "snp_track".
    The search type is chosen as follows:
      - region: "region" is given, or "searchTerm" looks like chrom:start-end;
      - snp: "searchTerm" starts with an rs identifier (and no region applies);
      - gene: anything else, searched against the name/ensg/oeName fields.

    @param request: the HTTP request carrying the GET parameters.
    @param url: not used in the body of this function.
    @return: a JsonResponse holding either {'error': message} or the keys
             hic, frags, meta, snps, snp_meta, genes, region, blueprint and extra.
    '''
    region_re = re.compile(r"(.*):(\d+)-(\d+)")

    queryDict = request.GET
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    addList = []
    searchType = 'gene'
    # Filled in by whichever search branch runs below.
    chrom = segmin = segmax = None
    position = None

    # A missing searchTerm previously raised AttributeError on .upper();
    # treat it the same way as an empty term.
    rawTerm = queryDict.get("searchTerm")
    if rawTerm is None:
        rawTerm = ""
    searchTerm = rawTerm.upper()

    if targetIdx not in utils.tissues:
        # (Re)load the tissue list of every configured target index.
        for idx in getattr(chicp_settings, 'TARGET_IDXS'):
            elasticJSON = Search(idx=idx).get_mapping(mapping_type="gene_target")
            tissueList = list(elasticJSON[idx]['mappings']['gene_target']['_meta']['tissue_type'].keys())
            utils.tissues[idx] = tissueList
        if targetIdx not in utils.tissues:
            # Every branch below indexes utils.tissues[targetIdx]; fail cleanly
            # instead of raising KeyError.
            return JsonResponse({'error': 'Unknown target index.'})

    regionParam = queryDict.get("region")
    termIsRegion = region_re.match(rawTerm) is not None
    if regionParam or termIsRegion:
        searchType = 'region'
        if regionParam:
            region = regionParam
        else:
            # The search term itself is the region, so there is no text to search.
            region = rawTerm
            searchTerm = ""
        mo = region_re.match(region)
        if mo is None:
            # Only reachable for a malformed "region" parameter.
            return JsonResponse({'error': 'Region must be given as chromosome:start-end.'})
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "")

    if re.search("^rs[0-9]+", rawTerm.lower()):
        searchTerm = rawTerm.lower()
        addList.append(_find_snp_position(queryDict.get("snp_track"), searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        if searchType != 'region':
            searchType = 'snp'

    logger.warning("### %s - %s ###", searchType, searchTerm)

    if searchType == 'region':
        must = [Query.term("baitChr", chrom),
                Query.term("oeChr", chrom),
                RangeQuery("dist", gte=-2e6, lte=2e6)]
        if searchTerm and not addList and not termIsRegion:
            # Region supplied separately: also restrict by the free-text term.
            must.insert(0, Query.query_string(searchTerm, fields=["name", "ensg"]))
        query_bool = BoolQuery()
        query_bool.must(must)
        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if addList:
            filter_bool = _position_overlap_filter(position)
        else:
            filter_bool = BoolQuery()
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                           sources=utils.hicFields + utils.tissues[targetIdx])
        # The requested segment is kept; the bounds returned here are not used.
        hic, _, _ = _build_hic_query(query, targetIdx, segmin, segmax)

        if not hic:
            retJSON = {'error': rawTerm+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        # addList always holds the SNP lookup result when the type is 'snp'.
        chrom = addList[0]['chr']

        query_bool = BoolQuery()
        query_bool.must([Query.term("baitChr", chrom),
                         Query.term("oeChr", chrom),
                         RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)

        filter_bool = _position_overlap_filter(position)

        query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                           sources=utils.hicFields + utils.tissues[targetIdx])
        hic, segmin, segmax = _build_hic_query(query, targetIdx)

        if not hic:
            retJSON = {'error': 'Marker '+searchTerm+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    else:
        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool, sources=utils.hicFields + utils.tissues[targetIdx])
        hic, segmin, segmax = _build_hic_query(query, targetIdx)

        if not hic:
            retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
            return JsonResponse(retJSON)
        chrom = hic[0]['baitChr']

    if chrom is None:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes, SNPs and fragments based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(queryDict.get("snp_track"), chrom, segmin, segmax)
    frags = _build_frags_query(getattr(chicp_settings, 'DEFAULT_FRAG'), chrom, segmin, segmax)

    addList = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], addList)

    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": int(segmin),
                        "oend": int(segmax),
                        "rstart": 1,
                        "rend": int(segmax) - int(segmin),
                        "rchr": str(chrom),
                        "tissues": utils.tissues[targetIdx]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }

    return JsonResponse(retJSON)