def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section from the GENE index.

    Searches by gene symbol when one is given. Otherwise searches by
    location on C{seqid}: genes spanning C{start_pos} when C{end_pos} is
    None, or genes lying entirely within C{start_pos}..C{end_pos}.

    @return: dict whose 'es_genes' entry holds the documents found.
    '''
    # Location queries use the "chr"-prefixed form of the sequence id.
    if not (isinstance(seqid, str) and seqid.startswith("chr")):
        seqid = 'chr' + str(seqid)

    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("gene_symbol", gene_symbol)
    else:
        on_seqid = Query.match("seqid", seqid)
        if end_pos is None:
            # single position (e.g. a SNP): the gene has to span it
            start_clause = RangeQuery("featureloc.start", lte=start_pos)
            end_clause = RangeQuery("featureloc.end", gte=start_pos)
        else:
            # interval: the gene has to lie inside start_pos..end_pos
            start_clause = RangeQuery("featureloc.start", gte=start_pos)
            end_clause = RangeQuery("featureloc.end", lte=end_pos)
        query = ElasticQuery.bool(
            BoolQuery(must_arr=[on_seqid, start_clause, end_clause]))

    gene_index = ElasticSettings.idx(name='GENE')
    return {'es_genes': Search(query, idx=gene_index).search().docs}
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section from the GENE index.

    A gene symbol, when supplied, takes precedence over the location
    arguments. Without one, genes are looked up on the chromosome named by
    C{seqid} (any 'chr' text removed): those spanning C{start_pos} when
    C{end_pos} is None, else those contained in C{start_pos}..C{end_pos}.

    @return: dict whose 'es_genes' entry holds the documents found.
    '''
    chromosome = str(seqid).replace('chr', '')

    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("symbol", gene_symbol)
    else:
        must = [Query.match("chromosome", chromosome)]
        if end_pos is None:
            # single position (e.g. a SNP): gene start <= position <= gene stop
            must.append(RangeQuery("start", lte=start_pos))
            must.append(RangeQuery("stop", gte=start_pos))
        else:
            # interval: gene start and stop both inside start_pos..end_pos
            must.append(RangeQuery("start", gte=start_pos))
            must.append(RangeQuery("stop", lte=end_pos))
        query = ElasticQuery.bool(BoolQuery(must_arr=must))

    search = Search(query, idx=ElasticSettings.idx(name='GENE'))
    hits = search.search()
    return {'es_genes': hits.docs}
def test_bool_filtered_query(self):
    ''' Run a match-all query filtered by a bool filter that combines
    must, must_not and should clauses; exactly one marker is expected. '''
    excluded = [Query.term("seqid", 2)]
    optional = [RangeQuery("start", gte=10050)]
    marker_filter = BoolQuery(must_not_arr=excluded, should_arr=optional)
    (marker_filter
        .must([Query.term("id", "rs768019142")])
        .should(RangeQuery("start", gte=10054)))

    query = ElasticQuery.filtered_bool(Query.match_all(), marker_filter,
                                       sources=["id", "seqid"])
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def test_bool_filtered_query2(self):
    ''' Run a query-string search for the marker id, filtered by a bool
    filter built through chained should/must calls; one hit is expected. '''
    seqid_filter = BoolQuery()
    (seqid_filter
        .should(RangeQuery("start", lte=20000))
        .should(Query.term("seqid", 2))
        .must(Query.term("seqid", 1)))

    marker_lookup = Query.query_string("rs768019142", fields=["id", "seqid"])
    query = ElasticQuery.filtered_bool(marker_lookup, seqid_filter,
                                       sources=["id", "seqid", "start"])
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def get_elastic_query(cls, section=None, config=None):
    ''' Build the elastic query object for one criteria section.

    @type  section: string
    @keyword section: The section in the criteria.ini file
    @type  config: string
    @keyword config: The config object initialized from criteria.ini.
    @return: L{Query}; None when the section has no dedicated query and
             declares no source_fields.
    '''
    section_config = config[section]
    # Optional comma-separated list of fields to return for the section.
    source_fields = []
    if 'source_fields' in section_config:
        source_fields_str = section_config['source_fields']
        source_fields = source_fields_str.split(',')
    if 'mhc' in section:
        # Defined MHC region as chr6:25,000,000..35,000,000
        seqid = '6'
        start_range = 25000000
        end_range = 35000000
        # All three parameter names are read for every 'mhc' section, so a
        # section lacking any of them fails here even if it does not use them.
        seqid_param = section_config['seqid_param']
        start_param = section_config['start_param']
        end_param = section_config['end_param']
        # NOTE(review): an 'mhc' section other than the three handled below
        # leaves `query` unassigned, so the final return would fail - confirm
        # whether such sections can occur.
        if section == 'is_gene_in_mhc':
            # genes: range-overlap query using the configured field names
            query = ElasticUtils.range_overlap_query(seqid, start_range, end_range, source_fields, seqid_param, start_param, end_param)
        elif section == 'is_marker_in_mhc':
            # markers: start position inside the MHC range on seqid 6
            query_bool = BoolQuery()
            query_bool.must(RangeQuery("start", lte=end_range)) \
                      .must(RangeQuery("start", gte=start_range)) \
                      .must(Query.term("seqid", seqid))
            query = ElasticQuery.filtered_bool(Query.match_all(), query_bool, sources=["id", "seqid", "start"])
        elif section == 'is_region_in_mhc':
            # regions: matched on region name rather than on coordinates
            query = ElasticQuery(Query.term("region_name", "MHC"))
    elif section == 'marker_is_gwas_significant_in_ic':
        # build a range query: p_value at or below the 5e-8 threshold
        gw_sig_p = 0.00000005
        query = ElasticQuery(RangeQuery("p_value", lte=gw_sig_p))
    else:
        if len(source_fields) > 0:
            # generic section: match everything, returning only source_fields
            query = ElasticQuery(Query.match_all(), sources=source_fields)
        else:
            # no dedicated query and no source fields: nothing to build
            return None
    return query
def test_bool_filtered_query4(self):
    ''' Run a match-all query filtered by a bool filter whose must clauses
    include a match query.
    Note: query_wrap() is used to wrap the match in a query object. '''
    wrapped_match = Query.match("id", "rs768019142").query_wrap
    marker_filter = BoolQuery()
    (marker_filter
        .should(RangeQuery("start", lte=20000))
        .should(Query.term("seqid", 2))
        .must(wrapped_match())
        .must(Query.term("seqid", 1)))

    query = ElasticQuery.filtered_bool(Query.match_all(), marker_filter,
                                       sources=["id", "seqid", "start"])
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def test_bool_query(self):
    ''' Run a bool query (must / must_not / should) with highlighting and
    expect a single document back. '''
    marker_query = BoolQuery()
    highlight = Highlight(["id", "seqid"])
    (marker_query
        .must(Query.term("id", "rs768019142"))
        .must(RangeQuery("start", gt=1000))
        .must_not(Query.match("seqid", "2"))
        .should(Query.match("seqid", "3"))
        .should(Query.match("seqid", "1")))

    query = ElasticQuery.bool(marker_query, highlight=highlight)
    search = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    found = search.search().docs
    self.assertTrue(len(found) == 1,
                    "Elastic string query retrieved marker (rs768019142)")
def fetch_overlapping_features(cls, build, seqid, start, end, idx=None, idx_type=None, disease_id=None):
    ''' Fetch the features overlapping a given stretch of a chromosome.

    The build info is stored as a nested document, so the build, seqid and
    overlap constraints are wrapped in a nested query on 'build_info'. A
    feature overlaps the region when its start or its end falls inside the
    region, or when it spans the whole region. When a disease_id is given
    the hits are additionally restricted to that disease.

    @type  build: string
    @param build: build info eg: 'GRCh38'
    @type  seqid: string
    @param seqid: chromosome number
    @type  start: string
    @param start: region start
    @type  end:  string
    @param end: region end
    @type  idx:  string
    @param idx: name of the index
    @type  idx_type:  string
    @param idx_type: name of the idx type, each criteria is an index type
    @type  disease_id:  string
    @param disease_id: disease code
    @return: the docs of the search result
    '''
    sources = ['build_info.*', 'disease_locus', 'disease', 'chr_band', 'species']

    # feature starts before the region and ends after it
    spans_region = BoolQuery()
    spans_region.must(RangeQuery("build_info.start", lte=start)) \
                .must(RangeQuery("build_info.end", gte=end))

    # ... or its start / end lies inside the region
    or_filter = OrFilter(RangeQuery("build_info.start", gte=start, lte=end))
    or_filter.extend(RangeQuery("build_info.end", gte=start, lte=end)) \
             .extend(spans_region)

    # The location constraints are built before branching so that they apply
    # to the disease-restricted search too; previously that branch nested an
    # empty bool query and so ignored build, seqid, start and end.
    location_query = BoolQuery()
    location_query.must(Query.term("build_info.build", build)) \
                  .must(Query.term("build_info.seqid", seqid)) \
                  .filter(or_filter)
    nested_location = Query.nested('build_info', location_query)

    if disease_id:
        disease_query = BoolQuery()
        disease_query.must(Query.term("disease", disease_id.lower())).must(nested_location)
        search_query = ElasticQuery(disease_query, sources=sources)
    else:
        search_query = ElasticQuery(nested_location, sources=sources)

    elastic = Search(search_query, idx=idx, idx_type=idx_type)
    return elastic.search().docs
def _build_exon_query(chrom, segmin, segmax, genes):
    ''' Get the exonic structure for the genes in this section.

    For every gene the exons index is searched by gene id on the given
    chromosome; the hits are passed through utils.makeRelative for the
    segmin..segmax segment and stored, sorted by start, under the gene id.

    @return: dict mapping gene id to its list of exons.
    '''
    exons_by_gene = dict()
    chrom_filter = BoolQuery()
    chrom_filter.must([Query.term("seqid", chrom)])
    if len(genes) == 0:
        return exons_by_gene

    by_start = operator.itemgetter("start")
    for gene in genes:
        gene_id = gene["gene_id"]
        query = ElasticQuery.filtered_bool(
            Query.query_string(gene_id, fields=["name"]),
            chrom_filter, sources=utils.snpFields)
        search = Search(query, idx=getattr(chicp_settings, 'CP_GENE_IDX')+'/exons/',
                        search_from=0, size=2000)
        exons = search.get_result()['data']
        exons = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], exons)
        exons_by_gene[gene_id] = sorted(exons, key=by_start)
    return exons_by_gene
def _get_query_filters(q_dict, user):
    ''' Build query bool filter. When biotypes are requested they are added
    to the filter, together with the index types of any non-gene categories
    so that those other types are still allowed through.

    @type  q_dict: dict
    @param q_dict: request dictionary.
    @return: Filter wrapping the bool query, or None if no biotypes are given.
    '''
    biotypes = q_dict.getlist("biotypes")
    if not biotypes:
        return None

    biotype_or_type = BoolQuery()
    biotype_or_type.should(Query.terms("biotype", biotypes))

    type_filter = []
    for category in q_dict.getlist("categories"):
        if category == "gene":
            continue
        props = ElasticSettings.search_props(category.upper(), user)
        type_filter.append(Query.query_type_for_filter(props['idx_type']))
    if type_filter:
        biotype_or_type.should(type_filter)
    return Filter(biotype_or_type)
def test_and_filtered_query(self):
    ''' Run a term query filtered by an AND filter made of a bool query,
    a range query and a term query; at least one hit is expected. '''
    start_bool = BoolQuery(must_arr=[RangeQuery("start", gte=1)])
    all_of = AndFilter(start_bool)
    (all_of
        .extend(RangeQuery("start", gte=1))
        .extend(Query.term("seqid", 1)))

    query = ElasticQuery.filtered(Query.term("seqid", 1), all_of)
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total >= 1,
                    "Elastic filtered query retrieved marker(s)")
def test_bool_nested_filter(self):
    ''' Test a bool filter that has another bool query as one of its
    should clauses. '''
    inner = BoolQuery()
    (inner
        .must(Query.match("id", "rs768019142").query_wrap())
        .must(Query.term("seqid", 1)))

    outer = BoolQuery()
    (outer
        .should(inner)
        .should(Query.term("seqid", 2)))

    query = ElasticQuery.filtered_bool(Query.match_all(), outer,
                                       sources=["id", "seqid", "start"])
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total >= 1, "Nested bool filter query")
def test_or_filtered_query(self):
    ''' Run a highlighted term query filtered by an OR filter made of a
    range query, a bool query and a wrapped query string. '''
    highlight = Highlight(["id", "seqid"])
    spanning = BoolQuery(must_arr=[RangeQuery("start", lte=1),
                                   RangeQuery("end", gte=100000)])
    any_of = OrFilter(RangeQuery("start", gte=1, lte=100000))
    (any_of
        .extend(spanning)
        .extend(Query.query_string("rs*", fields=["id", "seqid"]).query_wrap()))

    query = ElasticQuery.filtered(Query.term("seqid", 1), any_of, highlight=highlight)
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total >= 1,
                    "Elastic filtered query retrieved marker(s)")
def test_search_count(self):
    ''' Compare the document count of the whole index type with the count
    obtained when documents on chr1 are excluded. '''
    gff = IDX['GFF_GENERIC']
    idx, idx_type = gff['indexName'], gff['indexType']

    total = ElasticUtils.get_docs_count(idx, idx_type)
    self.assertGreater(total, 0, 'index count')

    not_chr1 = BoolQuery(must_not_arr=[Query.term('seqid', 'chr1')])
    without_chr1 = ElasticUtils.get_docs_count(
        idx, idx_type, search_query=ElasticQuery(not_chr1))
    self.assertGreater(total, without_chr1, 'search query count')
def _get_current_build_info(self, seqid, position):
    ''' Get upper & lower boundaries for a hit given the position of the marker.

    The HAPMAP index is searched for the first entry at or after
    C{position}; the boundaries are then taken from the entries whose
    genetic map position lies 0.1 above and 0.1 below that entry's.

    @return: dict with keys 'build' (always 38), 'seqid', 'start' and 'end'.
    '''
    def first_hit(range_query, order):
        # top HAPMAP document on this seqid that satisfies range_query
        query = ElasticQuery(BoolQuery(must_arr=[range_query,
                                                 Query.match("seqid", seqid)]))
        hapmap_idx = ElasticSettings.idx('HAPMAP', 'HAPMAP')
        return Search(query, idx=hapmap_idx, qsort=Sort(order), size=1).search().docs[0]

    anchor = first_hit(RangeQuery("position", gte=position), 'position:asc')
    genetic_map_position = anchor.genetic_map_position

    # NOTE(review): 'start' comes from the entry ABOVE the anchor's genetic
    # map position and 'end' from the entry below it - confirm this is the
    # intended orientation.
    above = first_hit(
        RangeQuery("genetic_map_position", gte=(genetic_map_position + 0.1)),
        'position:asc')
    start = int(above.position)

    below = first_hit(
        RangeQuery("genetic_map_position", lte=(genetic_map_position - 0.1)),
        'position:desc')
    end = int(below.position)

    return {'build': 38, 'seqid': seqid, 'start': start, 'end': end}
def test_function_score_filter(self):
    ''' Test a function score query with a filter: documents are scored by
    their start value, so score equals start and starts strictly decrease. '''
    by_start = ScoreFunction.create_score_function('field_value_factor', field='start')
    start_filter = Filter(BoolQuery(must_arr=[RangeQuery("start", lte=50000)]))
    scored = FunctionScoreQuery(start_filter, [by_start], boost_mode='replace')
    search = Search(ElasticQuery(scored), idx=ElasticSettings.idx('DEFAULT'))
    docs = search.search().docs
    self.assertGreater(len(docs), 1, str(len(docs)))

    previous_start = sys.maxsize
    for doc in docs:
        start = doc.start
        # test that the start is equal to the score
        score = vars(doc)['_meta']['_score']
        self.assertEqual(start, int(score))
        self.assertGreater(previous_start, start)
        previous_start = start
def _get_chr_band(self, seqid, position):
    ''' Get chr band for a given chr/position.

    Positions 24891793..34924245 on seqid 6 are reported as 'MHC'; any other
    location is looked up in the BAND index and returned as seqid + band name.
    '''
    # NOTE(review): seqid is compared with the integer 6, so a string seqid
    # never takes this shortcut - confirm what callers pass.
    in_mhc = seqid == 6 and 24891793 <= position <= 34924245
    if in_mhc:
        return 'MHC'

    covering_band = BoolQuery(must_arr=[
        Query.match("seqid", seqid),
        RangeQuery("start", lte=position),
        RangeQuery("stop", gte=position),
    ])
    search = Search(ElasticQuery(covering_band),
                    idx=ElasticSettings.idx('BAND', 'BAND'), size=1)
    band = search.search().docs[0]
    return band.seqid + band.name
def range_overlap_query(cls, seqid, start_range, end_range, field_list=None, seqid_param="seqid", start_param="start", end_param="end"):
    ''' Constructs a range overlap query.

    A term query on the seqid field is filtered by an OR of three cases:
    the feature start lies inside the range, the feature end lies inside
    the range, or the feature spans the whole range.
    '''
    spans_range = BoolQuery(must_arr=[
        RangeQuery(start_param, lte=start_range),
        RangeQuery(end_param, gte=end_range),
    ])
    overlap = OrFilter(RangeQuery(start_param, gte=start_range, lte=end_range))
    (overlap
        .extend(RangeQuery(end_param, gte=start_range, lte=end_range))
        .extend(spans_range))

    on_seqid = Query.term(seqid_param, seqid)
    return ElasticQuery.filtered(on_seqid, overlap, field_list)
def chicpeaSearch(request, url):
    ''' Search view returning, as JSON, the interactions and annotation
    tracks for a gene name, a marker (rs id) or a region.

    Reads "targetIdx", "searchTerm", "region" and "snp_track" from the GET
    parameters. The search is a region search when "region" is given or the
    search term looks like chr:start-end, a marker search when the term
    starts with "rs<digits>", and a gene search otherwise. Every failure is
    reported as a JsonResponse holding a single 'error' entry.
    The url argument is not used in the body.
    '''
    queryDict = request.GET
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    hic = []
    addList = []
    searchType = 'gene'
    searchTerm = queryDict.get("searchTerm").upper()

    # First request for an unknown target: fill the tissue cache for every
    # configured target index from the gene_target mapping metadata.
    if targetIdx not in utils.tissues:
        for idx in getattr(chicp_settings, 'TARGET_IDXS'):
            elasticJSON = Search(idx=idx).get_mapping(mapping_type="gene_target")
            tissueList = list(elasticJSON[idx]['mappings']['gene_target']['_meta']['tissue_type'].keys())
            utils.tissues[idx] = tissueList

    # Region search: explicit "region" parameter or a chr:start-end term.
    if queryDict.get("region") or re.match(r"(.*):(\d+)-(\d+)", queryDict.get("searchTerm")):
        searchType = 'region'
        region = queryDict.get("searchTerm")
        if queryDict.get("region"):
            region = queryDict.get("region")
        else:
            # the term itself was the region, so no text term is left
            searchTerm = ""
        mo = re.match(r"(.*):(\d+)-(\d+)", region)
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "")
    # Marker search: resolve the rs id to a position on the chosen track.
    if re.search("^rs[0-9]+", queryDict.get("searchTerm").lower()):
        searchTerm = queryDict.get("searchTerm").lower()
        addList.append(_find_snp_position(queryDict.get("snp_track"), searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        # an explicit region takes precedence over the marker search type
        if searchType != 'region':
            searchType = 'snp'

    logger.warn("### "+searchType+" - "+searchTerm+' ###')

    if searchType == 'region':
        query_bool = BoolQuery()
        filter_bool = BoolQuery()
        # Same-chromosome interactions with dist within +/- 2e6; additionally
        # matched on name/ensg when a text term accompanies the region.
        if searchTerm and len(addList) == 0 and re.match(r"(.*):(\d+)-(\d+)", queryDict.get("searchTerm")) == None:
            query_bool.must([Query.query_string(searchTerm, fields=["name", "ensg"]),
                             Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        else:
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])

        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if len(addList) > 0:
            # marker inside the region: bait or other end must cover its position
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", 
                                                               gte=position)])])
        else:
            # plain region: bait or other end must lie inside segmin..segmax
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool, sources=utils.hicFields + utils.tissues[targetIdx])
        # the segment bounds returned here are ignored; the requested ones are kept
        (hic, v1, v2) = _build_hic_query(query, targetIdx, segmin, segmax)

        if len(hic) == 0:
            retJSON = {'error': queryDict.get("searchTerm")+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        # NOTE(review): nesting under this check was reconstructed from a
        # whitespace-collapsed source - confirm against the canonical file.
        if len(addList) > 0:
            chrom = addList[0]['chr']

            query_bool = BoolQuery()
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
            query_bool = _add_tissue_filter(query_bool, targetIdx)

            # bait or other end must cover the marker position
            filter_bool = BoolQuery()
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])

            query = ElasticQuery.filtered_bool(query_bool, filter_bool, sources=utils.hicFields + utils.tissues[targetIdx])
            # the segment to display is derived from the hits
            hic, segmin, segmax = _build_hic_query(query, targetIdx)

            if len(hic) == 0:
                retJSON = {'error': 'Marker '+searchTerm+' does not overlap any bait/target regions in this dataset.'}
                return JsonResponse(retJSON)
    else:
        # Gene search: text query on name/ensg/oeName, dist within +/- 2e6.
        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool, sources=utils.hicFields + utils.tissues[targetIdx])

        hic, segmin, segmax = _build_hic_query(query, targetIdx)

        if len(hic) == 0:
            retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
            return JsonResponse(retJSON)
        # the chromosome is taken from the first interaction found
        chrom = hic[0]['baitChr']

    # Guard for any path on which chrom was never assigned: referencing an
    # unassigned local raises a NameError subclass, reported as an error.
    try:
        chrom
    except NameError:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(queryDict.get("snp_track"), chrom, segmin, segmax)
    frags = _build_frags_query(getattr(chicp_settings, 'DEFAULT_FRAG'), chrom, segmin, segmax)

    addList = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], addList)

    # "meta" carries the original segment bounds (ostart/oend) alongside the
    # 1-based relative range (rstart/rend).
    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": int(segmin),
                        "oend": int(segmax),
                        "rstart": 1,
                        "rend": int(segmax) - int(segmin),
                        "rchr": str(chrom),
                        "tissues": utils.tissues[targetIdx]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }

    response = JsonResponse(retJSON)
    return response
def test_elastic_group_name(self):
    '''
    Testing the workflow defined in:
    https://killin.cimr.cam.ac.uk/nextgensite/2015/08/05/region-authorization/
    Testing various elastic queries against public docs (no group_name) and
    a private doc (group_name DIL), before and after the logged-in user is
    added to the DIL group.

    idx doc:
     "_source":{"attr": {"region_id": "803", "group_name": "[\"DIL\"]", "Name": "4q27"},
     "seqid": "chr4", "source": "immunobase", "type": "region",
     "score": ".", "strand": ".", "phase": ".", "start": 122061159, "end": 122684373}
    idx_query:
    Private(in given group) OR Public
    -d '{"query":{"filtered":{"filter":{"bool": {
                                        "should": [
                                                    {"terms": {"group_name":["dil"]}},
                                                    { "missing": { "field": "group_name"   }}
                                                  ]
                                                }}}}}'
    Private(in given group):
    -d '{"query":{"filtered":{"filter":{"terms":{"group_name":["dil"]}}}}}'
    Public:
    -d {'query': {'filtered': {'filter': {'missing': {'field': 'group_name'}},
    -                         'query': {'term': {'match_all': '{}'}}}}}
    '''
    # get the groups for the given user
    response = self.client.post('/accounts/login/', {
        'username': '******', 'password': '******'
    })
    # NOTE(review): this only checks that status_code is truthy ("200" is
    # the failure message) - confirm whether an equality check was intended.
    self.assertTrue(response.status_code, "200")

    logged_in_user = User.objects.get(
        id=self.client.session['_auth_user_id'])
    if logged_in_user and logged_in_user.is_authenticated():
        user_groups = get_user_groups(logged_in_user)
        self.assertTrue('READ' in user_groups, "user present in READ group")
        # make sure the user is not yet in DIL group
        self.assertFalse('DIL' in user_groups, "user not present in DIL group")

    # READ is dropped, so only additional (private) groups remain
    group_names = get_user_groups(logged_in_user)
    if 'READ' in group_names: group_names.remove('READ')  # @IgnorePep8
    group_names = [x.lower() for x in group_names]
    self.assertTrue(len(group_names) == 0, "No group present")

    # Match all query, as there is no group we do a match all
    query = ElasticQuery(Query.match_all())
    expected_query_string = {"query": {"match_all": {}}}
    self.assertJSONEqual(json.dumps(query.query), json.dumps(expected_query_string), "Query string matched")

    # refresh the index before searching it
    Search.index_refresh(self.index_name)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(
        len(docs) == 12, "Elastic string query retrieved all public regions")

    # Filtered query for group names, add the user to DIL group and get the query string
    self.dil_group = Group.objects.create(name='DIL')
    logged_in_user.groups.add(self.dil_group)
    group_names = get_user_groups(logged_in_user)
    if 'READ' in group_names: group_names.remove('READ')  # @IgnorePep8
    group_names = [x.lower() for x in group_names]
    self.assertTrue(len(group_names) > 0, "More than 1 group present")
    self.assertTrue("dil" in group_names, "DIL group present")

    # retrieves all docs with missing field group_name - 11 docs
    terms_filter = TermsFilter.get_missing_terms_filter(
        "field", "attr.group_name")
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(
        len(docs) == 11, "Elastic string query retrieved all public regions")

    # build filtered boolean query to bring all public docs + private docs 11+1 = 12 docs
    query_bool = BoolQuery()
    query_bool.should(Query.missing_terms("field", "group_name")) \
              .should(Query.terms("group_name", group_names).query_wrap())
    query = ElasticQuery.filtered_bool(Query.match_all(), query_bool)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(
        len(docs) == 12, "Elastic string query retrieved both public + private regions")

    # terms filter on the user's groups alone: only the private doc is expected
    terms_filter = TermsFilter.get_terms_filter("attr.group_name", group_names)
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(
        len(docs) == 1, "Elastic string query retrieved one private regions")
    self.assertEqual(docs[0].attr['Name'], "4q27", "type matched region")
    self.assertEqual(docs[0].attr['region_id'], "803", "type matched region")
    self.assertEqual(docs[0].attr['group_name'], "[\"DIL\"]", "type matched region")
def test_elastic_group_name(self):
    '''
    Testing the workflow defined in:
    https://killin.cimr.cam.ac.uk/nextgensite/2015/08/05/region-authorization/
    Testing various elastic queries: a match-all query while the user has no
    private group, then missing-field, bool and terms filters once the user
    has been added to the DIL group.

    idx doc:
     "_source":{"attr": {"region_id": "803", "group_name": "[\"DIL\"]", "Name": "4q27"},
     "seqid": "chr4", "source": "immunobase", "type": "region",
     "score": ".", "strand": ".", "phase": ".", "start": 122061159, "end": 122684373}
    idx_query:
    Private(in given group) OR Public
    -d '{"query":{"filtered":{"filter":{"bool": {
                                        "should": [
                                                    {"terms": {"group_name":["dil"]}},
                                                    { "missing": { "field": "group_name"   }}
                                                  ]
                                                }}}}}'
    Private(in given group):
    -d '{"query":{"filtered":{"filter":{"terms":{"group_name":["dil"]}}}}}'
    Public:
    -d {'query': {'filtered': {'filter': {'missing': {'field': 'group_name'}},
    -                         'query': {'term': {'match_all': '{}'}}}}}
    '''
    # get the groups for the given user
    response = self.client.post('/accounts/login/', {'username': '******', 'password': '******'})
    # NOTE(review): asserts truthiness of status_code only; "200" is just
    # the failure message - confirm the intended check.
    self.assertTrue(response.status_code, "200")

    logged_in_user = User.objects.get(id=self.client.session['_auth_user_id'])
    if logged_in_user and logged_in_user.is_authenticated():
        user_groups = get_user_groups(logged_in_user)
        self.assertTrue('READ' in user_groups, "user present in READ group")
        # make sure the user is not yet in DIL group
        self.assertFalse('DIL' in user_groups, "user not present in DIL group")

    # with READ removed, the user has no further groups at this point
    group_names = get_user_groups(logged_in_user)
    if 'READ' in group_names : group_names.remove('READ')  # @IgnorePep8
    group_names = [x.lower() for x in group_names]
    self.assertTrue(len(group_names) == 0, "No group present")

    # Match all query, as there is no group we do a match all
    query = ElasticQuery(Query.match_all())
    expected_query_string = {"query": {"match_all": {}}}
    self.assertJSONEqual(json.dumps(query.query), json.dumps(expected_query_string), "Query string matched")

    # refresh the index before searching it
    Search.index_refresh(self.index_name)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 12, "Elastic string query retrieved all public regions")

    # Filtered query for group names, add the user to DIL group and get the query string
    self.dil_group = Group.objects.create(name='DIL')
    logged_in_user.groups.add(self.dil_group)
    group_names = get_user_groups(logged_in_user)
    if 'READ' in group_names : group_names.remove('READ')  # @IgnorePep8
    group_names = [x.lower() for x in group_names]
    self.assertTrue(len(group_names) > 0, "More than 1 group present")
    self.assertTrue("dil" in group_names, "DIL group present")

    # retrieves all docs with missing field group_name - 11 docs
    terms_filter = TermsFilter.get_missing_terms_filter("field", "attr.group_name")
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 11, "Elastic string query retrieved all public regions")

    # build filtered boolean query to bring all public docs + private docs 11+1 = 12 docs
    query_bool = BoolQuery()
    query_bool.should(Query.missing_terms("field", "group_name")) \
              .should(Query.terms("group_name", group_names).query_wrap())
    query = ElasticQuery.filtered_bool(Query.match_all(), query_bool)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 12, "Elastic string query retrieved both public + private regions")

    # terms filter on the user's groups alone: only the private doc is expected
    terms_filter = TermsFilter.get_terms_filter("attr.group_name", group_names)
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 1, "Elastic string query retrieved one private regions")
    self.assertEqual(docs[0].attr['Name'], "4q27", "type matched region")
    self.assertEqual(docs[0].attr['region_id'], "803", "type matched region")
    self.assertEqual(docs[0].attr['group_name'], "[\"DIL\"]", "type matched region")
def chicpeaSearch(request, url):
    ''' Search view returning, as JSON, the interactions and annotation
    tracks for a gene name, a marker (rs id) or a region, restricted to the
    indices the requesting user is authorised for.

    Reads "targetIdx", "searchTerm", "region" and "snp_track" from the GET
    parameters. The search is a region search when "region" is given or the
    normalised term looks like chr:start-end, a marker search when the term
    starts with "rs<digits>", and a gene search otherwise. A gene name that
    matches several genes present in the target index yields a list of
    candidates instead of a result. Every failure is reported as a
    JsonResponse holding an 'error' entry.
    The url argument is not used in the body.
    '''
    queryDict = request.GET
    user = request.user
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    hic = []
    addList = []
    searchType = 'gene'
    # Normalise the term: upper case, no thousands separators, ".." range
    # notation turned into "-", and no spaces.
    searchTerm = queryDict.get("searchTerm").upper()
    searchTerm = searchTerm.replace(",", "")
    searchTerm = searchTerm.replace("..", "-")
    searchTerm = searchTerm.replace(" ", "")  # prevents issues with spaces in queries
    snpTrack = queryDict.get("snp_track")

    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(
        user=user, idx_keys=None, idx_type_keys=None)

    # Drop the requested SNP track unless it belongs to the 'ud' group or
    # the user is authorised for its CP_STATS index type.
    if snpTrack:
        mo = re.match(r"(.*)-(.*)", snpTrack)
        (group, track) = mo.group(1, 2)  # @UnusedVariable
        if group != 'ud' and 'CP_STATS_'+group.upper()+'.'+snpTrack.upper() not in idx_type_keys_auth:
            snpTrack = None

    # Fill the tissue cache for the authorised targets; asking for a target
    # the user may not see ends the request with a permission error.
    if targetIdx not in utils.tissues:
        for target in getattr(chicp_settings, 'CP_TARGET'):
            if 'CP_TARGET_'+target not in idx_keys_auth:
                if targetIdx == target:
                    retJSON = {'error': 'Sorry, you do not have permission to view this dataset.'}
                    return JsonResponse(retJSON)
                continue
            elasticJSON = Search(idx=ElasticSettings.idx('CP_TARGET_'+target)).get_mapping(mapping_type="gene_target")
            tissueList = list(elasticJSON[ElasticSettings.idx('CP_TARGET_'+target)]
                              ['mappings']['gene_target']['_meta']['tissue_type'].keys())
            utils.tissues['CP_TARGET_'+target] = tissueList

    # Region search: explicit "region" parameter or a chr:start-end term.
    if queryDict.get("region") or re.match(r"(.*):(\d+)-(\d+)", searchTerm):
        searchType = 'region'
        region = searchTerm
        if queryDict.get("region"):
            region = queryDict.get("region")
        else:
            # the term itself was the region, so no text term is left
            searchTerm = ""
        mo = re.match(r"(.*):(\d+)-(\d+)", region)
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "")
        chrom = chrom.replace('CHR', "")
    # Marker search: resolve the rs id to a position on the chosen track.
    if re.search("^rs[0-9]+", searchTerm.lower()):
        searchTerm = searchTerm.lower()
        addList.append(_find_snp_position(snpTrack, searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        # an explicit region takes precedence over the marker search type
        if searchType != 'region':
            searchType = 'snp'

    logger.warn("### "+searchType+" - "+searchTerm+' ###')

    if searchType == 'region':
        query_bool = BoolQuery()
        filter_bool = BoolQuery()
        # Same-chromosome interactions with dist within +/- 2e6; additionally
        # matched on name/ensg when a text term accompanies the region.
        if searchTerm and len(addList) == 0 and re.match(r"(.*):(\d+)-(\d+)",
                                                         queryDict.get("searchTerm").replace(",", "")) == None:
            query_bool.must([Query.query_string(searchTerm, fields=["name", "ensg"]),
                             Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        else:
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])

        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if len(addList) > 0:
            # marker inside the region: bait or other end must cover its position
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])
        else:
            # plain region: bait or other end must lie inside segmin..segmax
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
        # the segment bounds returned here are ignored; the requested ones are kept
        (hic, v1, v2) = _build_hic_query(query, targetIdx, segmin, segmax)  # @UnusedVariable

        # an "error" entry in the result is passed straight back to the client
        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': queryDict.get("searchTerm")+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        # NOTE(review): nesting under this check was reconstructed from a
        # whitespace-collapsed source - confirm against the canonical file.
        if len(addList) > 0:
            chrom = addList[0]['chr']

            query_bool = BoolQuery()
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
            query_bool = _add_tissue_filter(query_bool, targetIdx)

            # bait or other end must cover the marker position
            filter_bool = BoolQuery()
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])

            query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                               sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
            # the segment to display is derived from the hits
            hic, segmin, segmax = _build_hic_query(query, targetIdx)

            if "error" in hic:
                return JsonResponse(hic)
            if len(hic) == 0:
                retJSON = {'error': 'Marker '+searchTerm+' does not overlap any bait/target regions in this dataset.'}
                return JsonResponse(retJSON)
    else:
        # Gene search. A count-only query (size=0) first finds out how many
        # genes carry this name.
        geneQuery = ElasticQuery.filtered(Query.match_all(), Filter(Query.match("gene_name", searchTerm).query_wrap()))
        resultObj = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/',
                           search_query=geneQuery, size=0, qsort=Sort('seqid:asc,start')).search()
        if resultObj.hits_total > 1:
            # Ambiguous name: fetch all candidates, then keep only those whose
            # id occurs in the target index (terms aggregation on "ensg").
            geneResults = []
            resultObj2 = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/', search_query=geneQuery,
                                size=(resultObj.hits_total+1), qsort=Sort('seqid:asc,start')).search()
            docs = resultObj2.docs
            # [1:-1] drops the first and last character of the stored id;
            # presumably surrounding quotes (cf. the replace calls below)
            gene_ids = [getattr(doc, 'attr')['gene_id'][1:-1] for doc in docs]

            query = ElasticQuery.filtered(Query.match_all(), TermsFilter.get_terms_filter('ensg', gene_ids))
            agg = Agg('ensg_agg', "terms", {"field": "ensg", "size": 0})
            res = Search(idx=ElasticSettings.idx('CP_TARGET_'+targetIdx), search_query=query, aggs=Aggs(agg),
                         size=0).search()
            ensg_count = res.aggs['ensg_agg'].get_buckets()
            gene_ids = [g['key'] for g in ensg_count]

            for d in resultObj2.docs:
                if getattr(d, "attr")["gene_id"].replace('\"', '') in gene_ids:
                    geneResults.append({
                        'gene_name': getattr(d, "attr")["gene_name"].replace('\"', ''),
                        'gene_id': getattr(d, "attr")["gene_id"].replace('\"', ''),
                        'location': "chr" + getattr(d, "seqid") + ":" +
                        locale.format_string("%d", getattr(d, "start"), grouping=True) + ".." 
                        + locale.format_string("%d", getattr(d, "end"), grouping=True),
                    })

            if len(geneResults) == 0:
                retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
                return JsonResponse(retJSON)
            elif len(geneResults) > 1:
                # several candidates remain: let the client choose one
                retJSON = {
                    'error': 'Gene name <strong>'+searchTerm+'</strong> returns too many hits, please select your prefered result from the list below.',
                    'results': geneResults,
                    'cols': ['HGNC Symbol', 'Ensembl Gene ID', 'Location']
                }
                return JsonResponse(retJSON)

        # Text query on name/ensg/oeName, dist within +/- 2e6.
        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool, sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])

        (hic, segmin, segmax) = _build_hic_query(query, targetIdx)

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
            return JsonResponse(retJSON)
        # the chromosome is taken from the first interaction found
        chrom = hic[0]['baitChr']

    # Guard for any path on which chrom was never assigned: referencing an
    # unassigned local raises a NameError subclass, reported as an error.
    try:
        chrom
    except NameError:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(snpTrack, chrom, segmin, segmax)
    frags = _build_frags_query(getattr(chicp_settings, 'DEFAULT_FRAG'), chrom, segmin, segmax)

    addList = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], addList)

    # "meta" carries the original segment bounds (ostart/oend) alongside the
    # 1-based relative range (rstart/rend).
    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": int(segmin),
                        "oend": int(segmax),
                        "rstart": 1,
                        "rend": int(segmax) - int(segmin),
                        "rchr": str(chrom),
                        "tissues": utils.tissues['CP_TARGET_'+targetIdx]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }

    response = JsonResponse(retJSON)
    return response
def chicpeaSearch(request, url):
    '''
    Run a CHiCP search and return the matching interactions as JSON.

    The search is driven by the GET parameters of ``request``:

    * ``targetIdx``  - name of the target dataset (used as 'CP_TARGET_'+targetIdx).
    * ``searchTerm`` - a gene name, an rs marker id, or a ``chr:start-end`` region
      (commas are stripped and ``..`` is treated as ``-``).
    * ``region``     - optional explicit ``chr:start-end`` region; when present it
      takes precedence over a region parsed from ``searchTerm``.
    * ``snp_track``  - optional SNP track in ``group-track`` form; it is dropped
      when the user is not authorised for it or when it cannot be parsed.

    Three search modes are used: 'region', 'snp' (searchTerm starts with ``rs<digits>``)
    and 'gene' (everything else).

    @param request: the HTTP request (``request.GET`` and ``request.user`` are read).
    @param url: not used by this function.
    @return: a JsonResponse; either ``{'error': ...}`` (optionally with a list of
             candidate genes) or the hic/frags/snps/genes payload for the segment.
    '''
    # Local import: only needed to escape user input placed in an HTML error message.
    from html import escape

    region_pattern = r"(.*):(\d+)-(\d+)"

    queryDict = request.GET
    user = request.user
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    hic = []
    addList = []
    searchType = 'gene'

    regionParam = queryDict.get("region")
    # A missing searchTerm used to raise AttributeError on .upper(); treat it as empty.
    rawTerm = queryDict.get("searchTerm") or ""
    if not rawTerm and not regionParam:
        return JsonResponse({'error': 'No search term or region was supplied.'})

    searchTerm = rawTerm.upper().replace(",", "").replace("..", "-")
    snpTrack = queryDict.get("snp_track")

    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(
        user=user, idx_keys=None, idx_type_keys=None)

    if snpTrack:
        mo = re.match(r"(.*)-(.*)", snpTrack)
        if mo is None:
            # Without a "group-track" form the authorisation check cannot be made,
            # so the track is dropped exactly as an unauthorised track would be.
            snpTrack = None
        else:
            group = mo.group(1)
            if group != 'ud' and 'CP_STATS_'+group.upper()+'.'+snpTrack.upper() not in idx_type_keys_auth:
                snpTrack = None

    # NOTE(review): utils.tissues is keyed by 'CP_TARGET_'+name below, so this test
    # against the bare targetIdx looks like it is always true. It is deliberately left
    # unchanged because the loop also performs the per-request permission check.
    if targetIdx not in utils.tissues:
        for target in chicp_settings.CP_TARGET:
            target_key = 'CP_TARGET_'+target
            if target_key not in idx_keys_auth:
                if targetIdx == target:
                    retJSON = {'error': 'Sorry, you do not have permission to view this dataset.'}
                    return JsonResponse(retJSON)
                continue
            target_idx_name = ElasticSettings.idx(target_key)
            elasticJSON = Search(idx=target_idx_name).get_mapping(mapping_type="gene_target")
            tissue_types = elasticJSON[target_idx_name]['mappings']['gene_target']['_meta']['tissue_type']
            utils.tissues[target_key] = list(tissue_types.keys())

    if regionParam or re.match(region_pattern, searchTerm):
        searchType = 'region'
        if regionParam:
            region = regionParam
        else:
            # The search term itself was the region, so no free-text term remains.
            region = searchTerm
            searchTerm = ""
        mo = re.match(region_pattern, region)
        if mo is None:
            # Only reachable for a malformed explicit "region" parameter.
            return JsonResponse({'error': 'Region must be given as chromosome:start-end.'})
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "").replace('CHR', "")

    if re.search("^rs[0-9]+", searchTerm.lower()):
        searchTerm = searchTerm.lower()
        addList.append(_find_snp_position(snpTrack, searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        if searchType != 'region':
            searchType = 'snp'

    # logger.warn is a deprecated alias (removed in Python 3.13); the message is unchanged.
    logger.warning("### %s - %s ###", searchType, searchTerm)

    if searchType == 'region':
        query_bool = BoolQuery()
        filter_bool = BoolQuery()
        term_is_not_region = re.match(region_pattern, rawTerm.replace(",", "")) is None
        if searchTerm and not addList and term_is_not_region:
            # Region plus a free-text term: restrict to interactions matching the term.
            query_bool.must([Query.query_string(searchTerm, fields=["name", "ensg"]),
                             Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        else:
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])

        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if addList:
            # A marker was given: keep interactions whose bait or other end spans it.
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])
        else:
            # Keep interactions whose bait or other end lies inside the region.
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
        # The segment bounds returned here are ignored; the requested region is kept.
        (hic, _, _) = _build_hic_query(query, targetIdx, segmin, segmax)

        if "error" in hic:
            return JsonResponse(hic)
        if not hic:
            retJSON = {'error': rawTerm+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        if addList:
            chrom = addList[0]['chr']

        query_bool = BoolQuery()
        query_bool.must([Query.term("baitChr", chrom),
                         Query.term("oeChr", chrom),
                         RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)

        filter_bool = BoolQuery()
        filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                RangeQuery("baitEnd", gte=position)]),
                            BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                RangeQuery("oeEnd", gte=position)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
        hic, segmin, segmax = _build_hic_query(query, targetIdx)

        if "error" in hic:
            return JsonResponse(hic)
        if not hic:
            retJSON = {'error': 'Marker '+searchTerm+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    else:
        gene_idx = chicp_settings.CP_GENE_IDX + '/genes/'
        geneQuery = ElasticQuery.filtered(Query.match_all(),
                                          Filter(Query.match("gene_name", searchTerm).query_wrap()))
        # size=0: only the hit count is needed from this first search.
        resultObj = Search(idx=gene_idx, search_query=geneQuery, size=0,
                           qsort=Sort('seqid:asc,start')).search()
        if resultObj.hits_total > 1:
            # Ambiguous gene name: fetch all candidates and keep those present in the dataset.
            resultObj2 = Search(idx=gene_idx, search_query=geneQuery,
                                size=(resultObj.hits_total+1),
                                qsort=Sort('seqid:asc,start')).search()
            docs = resultObj2.docs
            # [1:-1] drops the first and last character of the stored gene_id
            # (presumably surrounding quotes, cf. the replace('"', '') calls below).
            candidate_ids = [doc.attr['gene_id'][1:-1] for doc in docs]
            query = ElasticQuery.filtered(Query.match_all(),
                                          TermsFilter.get_terms_filter('ensg', candidate_ids))
            agg = Agg('ensg_agg', "terms", {"field": "ensg", "size": 0})
            res = Search(idx=ElasticSettings.idx('CP_TARGET_'+targetIdx), search_query=query,
                         aggs=Aggs(agg), size=0).search()
            # Set for O(1) membership tests in the loop below.
            gene_ids = {bucket['key'] for bucket in res.aggs['ensg_agg'].get_buckets()}

            geneResults = []
            for d in docs:
                gene_id = d.attr["gene_id"].replace('\"', '')
                if gene_id in gene_ids:
                    geneResults.append({
                        'gene_name': d.attr["gene_name"].replace('\"', ''),
                        'gene_id': gene_id,
                        'location': "chr" + d.seqid + ":" +
                                    locale.format_string("%d", d.start, grouping=True) + ".." +
                                    locale.format_string("%d", d.end, grouping=True),
                    })

            if not geneResults:
                retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
                return JsonResponse(retJSON)
            elif len(geneResults) > 1:
                # This message contains HTML markup, so the user-supplied term is escaped.
                retJSON = {
                    'error': 'Gene name <strong>'+escape(searchTerm)+'</strong> returns too many hits, ' +
                             'please select your prefered result from the list below.',
                    'results': geneResults,
                    'cols': ['HGNC Symbol', 'Ensembl Gene ID', 'Location']
                }
                return JsonResponse(retJSON)

        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])

        (hic, segmin, segmax) = _build_hic_query(query, targetIdx)

        if "error" in hic:
            return JsonResponse(hic)
        if not hic:
            retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
            return JsonResponse(retJSON)
        chrom = hic[0]['baitChr']

    # Defensive check kept from the original: bail out if no branch bound chrom.
    try:
        chrom
    except NameError:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes, snps and fragments based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(snpTrack, chrom, segmin, segmax)
    frags = _build_frags_query(chicp_settings.DEFAULT_FRAG, chrom, segmin, segmax)

    seg_start = int(segmin)
    seg_end = int(segmax)
    addList = utils.makeRelative(seg_start, seg_end, ['start', 'end'], addList)

    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": seg_start,
                        "oend": seg_end,
                        "rstart": 1,
                        "rend": seg_end - seg_start,
                        "rchr": str(chrom),
                        "tissues": utils.tissues['CP_TARGET_'+targetIdx]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }

    return JsonResponse(retJSON)