def filter_queryset(self, request, queryset, view):
    ''' Fetch only the requested page of documents from elastic.

    The page size and offset come from the view's paginator. Request GET
    parameters whose names appear in the view's C{filter_fields} attribute
    are passed to C{self._build_filters}; when that returns None a plain
    match-all query is run instead of a filtered one. The total hit count
    from the response is stored on the view as C{es_count}.

    @return: list of L{ElasticObject}, one per hit, with C{uuid} set to the
             hit's C{_id}. The C{queryset} argument is not used.
    '''
    page_size = view.paginator.get_limit(request)
    page_offset = view.paginator.get_offset(request)

    allowed_fields = getattr(view, 'filter_fields', [])
    requested = {name: value for name, value in request.GET.items() if name in allowed_fields}
    search_filters = self._build_filters(filters=requested)

    if search_filters is None:
        elastic_query = ElasticQuery(Query.match_all())
    else:
        elastic_query = ElasticQuery.filtered(Query.match_all(), search_filters)

    search = Search(search_query=elastic_query, idx=view.idx, size=page_size, search_from=page_offset)
    hits = search.get_json_response()['hits']

    results = []
    for hit in hits['hits']:
        obj = ElasticObject(initial=hit['_source'])
        obj.uuid = hit['_id']
        results.append(obj)
    view.es_count = hits['total']
    return results
def get_elastic_query(cls, section=None, config=None):
    ''' Build the elastic query object for a criteria section.

    @type  section: string
    @keyword section: The section in the criteria.ini file
    @type  config: string
    @keyword config: The config object initialized from criteria.ini.
    @return: L{Query}, or None when the section is not one of the specially
             handled ones and it defines no C{source_fields}.
    '''
    section_config = config[section]
    if 'source_fields' in section_config:
        source_fields = section_config['source_fields'].split(',')
    else:
        source_fields = []

    if 'mhc' in section:
        # Defined MHC region as chr6:25,000,000..35,000,000
        seqid = '6'
        start_range = 25000000
        end_range = 35000000
        # every section with 'mhc' in its name must define these parameters
        seqid_param = section_config['seqid_param']
        start_param = section_config['start_param']
        end_param = section_config['end_param']

    if section == 'is_gene_in_mhc':
        # for region you should make a different query
        return ElasticUtils.range_overlap_query(seqid, start_range, end_range, source_fields,
                                                seqid_param, start_param, end_param)

    if section == 'is_marker_in_mhc':
        query_bool = BoolQuery()
        query_bool.must(RangeQuery("start", lte=end_range)) \
                  .must(RangeQuery("start", gte=start_range)) \
                  .must(Query.term("seqid", seqid))
        return ElasticQuery.filtered_bool(Query.match_all(), query_bool, sources=["id", "seqid", "start"])

    if section == 'is_region_in_mhc':
        return ElasticQuery(Query.term("region_name", "MHC"))

    if section == 'marker_is_gwas_significant_in_ic':
        # build a range query on the genome-wide significance threshold
        gw_sig_p = 0.00000005
        return ElasticQuery(RangeQuery("p_value", lte=gw_sig_p))

    if not source_fields:
        return None
    return ElasticQuery(Query.match_all(), sources=source_fields)
def test_region_attributes(self):
    ''' Test that the ids listed on a padded random region resolve in their own indices.

    For each of the genes, studies and pmids attributes of the padded
    region, an ids query is run against the matching index and the number
    of hits is compared with the number of ids.
    '''
    (idx, idx_type) = ElasticSettings.idx(RegionDataTest.IDX_KEY, 'REGION').split('/')
    docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(), sources=[], size=1)
    padded_region = utils.Region.pad_region_doc(docs[0])

    # (region attribute, index key and index type key, assertion message)
    checks = (
        ("genes", 'GENE', "All genes on region found in GENE index"),
        ("studies", 'STUDY', "All study ids for region found in STUDY index"),
        ("pmids", 'PUBLICATION', "All PMIDs for region found in PUBLICATION index"),
    )
    for attr, idx_key, msg in checks:
        ids = getattr(padded_region, attr)
        if len(ids) > 0:
            search = Search(ElasticQuery(Query.ids(ids)), idx=ElasticSettings.idx(idx_key, idx_key),
                            size=len(ids))
            self.assertEqual(len(ids), search.search().hits_total, msg)
def test_error(self):
    ''' Test that invalid function score queries and score functions raise QueryError. '''
    score_function = ScoreFunction.create_score_function('field_value_factor', field='start')

    # first argument is not a query object
    with self.assertRaises(QueryError):
        FunctionScoreQuery('test_not_query', [score_function])
    # list element is not a score function
    with self.assertRaises(QueryError):
        FunctionScoreQuery(Query.match_all(), ['test_not_function_score'])

    # unknown function type, unknown keyword and wrong value type
    with self.assertRaises(QueryError):
        ScoreFunction.create_score_function('blah')
    with self.assertRaises(QueryError):
        ScoreFunction.create_score_function('field_value_factor', random_scoress='val')
    with self.assertRaises(QueryError):
        ScoreFunction.create_score_function('field_value_factor', field=10)
def _check_gene_history(cls, gene_sets, config):
    ''' Look up discontinued gene ids using a scan and scroll over the gene history index.

    NOTE: the original author flagged this as needing a better approach.

    @return: tuple of (dict mapping discontinued gene id to its replacement
             gene id, list of discontinued gene ids without a replacement);
             all ids are returned as strings.
    '''
    section = config['GENE_HISTORY']
    replaced_ids = {}
    retired_ids = []

    def process_hits(resp_json):
        # called with each batch of hits
        for hit in resp_json['hits']['hits']:
            doc = Document(hit)
            current_id = getattr(doc, 'geneid')
            old_id = getattr(doc, 'discontinued_geneid')
            if current_id is None:
                retired_ids.append(str(old_id))
            else:
                replaced_ids[str(old_id)] = str(current_id)

    history_filter = TermsFilter.get_terms_filter("discontinued_geneid", gene_sets)
    query = ElasticQuery.filtered(Query.match_all(), history_filter,
                                  sources=['geneid', 'discontinued_geneid'])
    ScanAndScroll.scan_and_scroll(section['index'], idx_type=section['index_type'],
                                  call_fun=process_hits, query=query)
    return (replaced_ids, retired_ids)
def gene_mgi_parse(cls, gene_pubs, idx):
    ''' Parse Ensembl and MGI data from JAX.

    Each tab separated record must carry an MGI id in its first column and
    an ENSMUSG id in its sixth column, otherwise a PipelineError is raised.
    Documents whose mouse ortholog ensembl id matches are then updated, in
    chunks, with the MGI id through a bulk load.
    '''
    mgi_by_ensmusg = {}
    for record in gene_pubs:
        columns = record.split('\t')
        if 'MGI:' not in columns[0]:
            raise PipelineError('MGI not found '+columns[0])
        if 'ENSMUSG' not in columns[5]:
            raise PipelineError('ENSMUSG not found '+columns[5])
        mgi_by_ensmusg[columns[5]] = columns[0].replace('MGI:', '')

    ensmusg_ids = list(mgi_by_ensmusg.keys())
    chunk_size = 450
    for offset in range(0, len(ensmusg_ids), chunk_size):
        chunk = ensmusg_ids[offset:offset+chunk_size]
        ortholog_filter = TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl", chunk)
        query = ElasticQuery.filtered(Query.match_all(), ortholog_filter)

        bulk_lines = []
        for doc in Search(query, idx=idx, size=chunk_size).search().docs:
            ens_id = doc.doc_id()
            idx_type = doc.type()
            mm = getattr(doc, 'dbxrefs')['orthologs']['mmusculus']
            mm['MGI'] = mgi_by_ensmusg[mm['ensembl']]
            dbxrefs = {"dbxrefs": {'orthologs': {"mmusculus": mm}}}
            action = {"update": {"_id": ens_id, "_type": idx_type, "_index": idx,
                                 "_retry_on_conflict": 3}}
            bulk_lines.append(json.dumps(action) + '\n')
            bulk_lines.append(json.dumps({'doc': dbxrefs}) + '\n')

        if bulk_lines:
            # idx_type is the type of the last document seen in this chunk
            Loader().bulk_load(idx, idx_type, ''.join(bulk_lines))
def test_bulk(self):
    ''' Test the Bulk.load(). '''
    self.set_up()
    idx = IDX['MARKER']['indexName']
    elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
    hits_total1 = elastic.get_count()['count']

    json_data = '{"index": {"_index": "%s", "_type": "%s"}}\n' % (idx, 'marker')
    json_data += json.dumps({"alt": "G", "start": 946, "seqid": "1", "filter": ".",
                             "ref": "A", "id": "rsXXXXX", "qual": ".", "info": "RS=XXXXX"})
    # the payload has no trailing newline yet, so the load must not succeed
    resp = Bulk.load(idx, '', json_data)
    self.assertNotEqual(resp.status_code, 200)

    # note: needs a trailing line return to work
    Bulk.load(idx, '', json_data + '\n')
    Search.index_refresh(idx)
    hits_total2 = elastic.get_count()['count']
    self.assertEqual(hits_total2, hits_total1+1, "contains documents")

    # produce errors updating doc id that doesn't exist
    json_data += '{"delete": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % (idx, 'marker', 'XYZ')
    json_data += '{"update": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % (idx, 'marker', 'XYZ')
    json_data += '{"doc": {"start": 100, "end": 200}}\n'
    resp = Bulk.load(idx, '', json_data)
    self.assertTrue('errors' in resp.json() and resp.json()['errors'])
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request just the documents required from elastic.

    The criteria index (or list of indices) is resolved by
    C{self._get_index} from the C{feature_type} request parameter, which
    defaults to C{GENE_CRITERIA}; the parameter is only honoured when it is
    listed in the view's C{filter_fields}. A match-all query is run for the
    page given by the view's paginator and the total hit count is stored on
    the view as C{es_count}.

    @return: list of L{ElasticObject}, one per hit, with C{uuid} and
             C{criteria_type} set from the hit's C{_id} and C{_type}.
             The C{queryset} argument is not used.
    '''
    q_size = view.paginator.get_limit(request)
    q_from = view.paginator.get_offset(request)

    filterable = getattr(view, 'filter_fields', [])
    filters = {k: v for k, v in request.GET.items() if k in filterable}
    criteria_idx = self._get_index(filters.get('feature_type', 'GENE_CRITERIA'))

    if isinstance(criteria_idx, list):
        # several criteria indices are searched together as a comma separated list
        idx = ','.join(ElasticSettings.idx(name) for name in criteria_idx)
    else:
        idx = ElasticSettings.idx(criteria_idx)

    q = ElasticQuery(Query.match_all())
    s = Search(search_query=q, idx=idx, size=q_size, search_from=q_from)
    json_results = s.get_json_response()

    results = []
    for result in json_results['hits']['hits']:
        new_obj = ElasticObject(initial=result['_source'])
        new_obj.uuid = result['_id']
        new_obj.criteria_type = result['_type']
        results.append(new_obj)
    view.es_count = json_results['hits']['total']
    return results
def test_missing_terms_filtered_query(self):
    ''' Test a filtered query built from a missing terms filter on the default index. '''
    missing_filter = TermsFilter.get_missing_terms_filter("field", "group_name")
    filtered_query = ElasticQuery.filtered(Query.match_all(), missing_filter)
    search = Search(filtered_query, idx=ElasticSettings.idx('DEFAULT'))
    n_docs = len(search.search().docs)
    self.assertTrue(n_docs == 3, "Elastic string query retrieved all public docs")
def get_rdm_feature_id(cls, idx, idx_type, qbool=Query.match_all(), sources=[], field=None):
    ''' Get a random feature id from the indices.

    @return: the value of C{field} on a randomly chosen document, or the
             document id when no field is given.
    '''
    random_doc = cls.get_rdm_docs(idx, idx_type, qbool=qbool, sources=sources, size=1)[0]
    return random_doc.doc_id() if field is None else getattr(random_doc, field)
def _ensembl_entrez_lookup(cls, ensembl_gene_sets, section):
    ''' Get an ensembl:entrez id dictionary.

    Searches the index named in C{section['index']} for documents whose
    C{dbxrefs.ensembl} is in the given ids and maps each document id to
    its C{dbxrefs} entrez value.
    '''
    ensembl_filter = TermsFilter.get_terms_filter("dbxrefs.ensembl", ensembl_gene_sets)
    equery = ElasticQuery.filtered(Query.match_all(), ensembl_filter,
                                   sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
    search = Search(equery, idx=section['index'], size=len(ensembl_gene_sets))

    lookup = {}
    for doc in search.search().docs:
        doc_id = doc.doc_id()
        lookup[doc_id] = getattr(doc, 'dbxrefs')['entrez']
    return lookup
def test_doc(self):
    ''' Test that a search on the GENE index and type returns GeneDocument objects. '''
    gene_settings = PydginTestSettings.IDX['GENE']
    query = ElasticQuery(Query.match_all(), sources=['symbol'])
    search = Search(search_query=query, idx=gene_settings['indexName'],
                    idx_type=gene_settings['indexType'], size=2)
    for doc in search.search().docs:
        self.assertTrue(isinstance(doc, GeneDocument))
def test_hit_attributes(self):
    ''' Fetch one random document from each of the region index types.

    NOTE(review): the fetched documents are not checked any further in
    this test; it only exercises the random document lookup.
    '''
    for idx_type_key in RegionDataTest.IDX_TYPE_KEYS:
        idx_name, idx_type = ElasticSettings.idx(RegionDataTest.IDX_KEY, idx_type_key).split('/')
        docs = ElasticUtils.get_rdm_docs(idx_name, idx_type, qbool=Query.match_all(), sources=[], size=1)
def check_hits(resp_json):
    ''' Compare the positions of the documents in a response with the MARKER index.

    Documents in the response that have an id are looked up by that id in
    the MARKER index. For every matching marker whose info contains VC=SNV
    and whose start differs by more than one from the position returned by
    C{self._get_highest_build} for the response document, an error is
    logged - unless the response document has C{is_par} set or its
    C{allele_a} is 'D' or 'I'. The message also lists markers found at that
    position and any HISTORY entries whose rslow is the marker id.

    Note: C{self} and C{logger} are taken from the enclosing scope.
    '''
    ic_docs = {}
    for hit in resp_json['hits']['hits']:
        ic_doc = Document(hit)
        ic_id = getattr(ic_doc, "id")
        if ic_id is not None:
            ic_docs[ic_id] = ic_doc

    marker_ids = list(ic_docs.keys())
    id_filter = TermsFilter.get_terms_filter("id", marker_ids)
    marker_search = Search(ElasticQuery.filtered(Query.match_all(), id_filter),
                           idx=ElasticSettings.idx('MARKER', 'MARKER'), size=len(marker_ids))

    for marker_doc in marker_search.search().docs:
        if 'VC=SNV' not in getattr(marker_doc, "info"):
            continue
        rsid = getattr(marker_doc, "id")
        ic_doc = ic_docs[rsid]
        pos1 = getattr(marker_doc, "start")
        pos2 = self._get_highest_build(ic_doc)['position']
        if abs(int(pos1) - int(pos2)) <= 1:
            continue

        is_par = getattr(ic_doc, 'is_par')
        allele_a = getattr(ic_doc, 'allele_a')
        if is_par is not None or allele_a == 'D' or allele_a == 'I':
            continue

        msg = ("CHECK IC/DBSNP POSITIONS:: "+getattr(ic_doc, 'name') +
               ' '+str(pos2)+" "+rsid+' '+str(pos1))

        # markers found on the same sequence at the response document's position
        position_query = ElasticQuery.filtered(Query.term("seqid", getattr(marker_doc, 'seqid')),
                                               Filter(Query.term("start", pos2)))
        position_search = Search(position_query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
        for found in position_search.search().docs:
            msg += " ("+getattr(found, "id")+":"+str(getattr(found, "start"))+")"

        # history entries whose rslow is this marker id
        history_query = ElasticQuery.filtered(Query.match_all(), Filter(Query.term("rslow", rsid)))
        history_search = Search(history_query, idx=ElasticSettings.idx('MARKER', 'HISTORY'))
        for found in history_search.search().docs:
            msg += " (rshigh:"+str(getattr(found, "rshigh")) + \
                   " build_id:"+str(getattr(found, "build_id"))+")"

        logger.error(msg)
def test_doc2(self):
    ''' Test return correct type of FeatureDocument using multiple index search.

    Searches the GENE and DISEASE indices together and checks that every
    document is a gene or disease document, and that disease documents
    have a code attribute.
    '''
    idx = ','.join([PydginTestSettings.IDX['GENE']['indexName'],
                    PydginTestSettings.IDX['DISEASE']['indexName']])
    query = ElasticQuery(Query.match_all(), sources=['symbol', 'code'])
    for doc in Search(search_query=query, idx=idx, size=40).search().docs:
        self.assertTrue(isinstance(doc, (GeneDocument, DiseaseDocument)))
        if isinstance(doc, DiseaseDocument):
            self.assertTrue(hasattr(doc, 'code'))
def test_bool_filtered_query(self):
    ''' Test building and running a filtered boolean query.

    The bool query is seeded through the constructor arrays and then
    extended with chained must/should calls.
    '''
    excluded = [Query.term("seqid", 2)]
    optional = [RangeQuery("start", gte=10050)]
    query_bool = BoolQuery(must_not_arr=excluded, should_arr=optional)
    with_must = query_bool.must([Query.term("id", "rs768019142")])
    with_must.should(RangeQuery("start", gte=10054))

    query = ElasticQuery.filtered_bool(Query.match_all(), query_bool, sources=["id", "seqid"])
    search = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def test_sort_query(self):
    ''' Test sorting for a query.

    The sort is given once as a string and once as a dictionary; anything
    else (here an integer) must raise a QueryError.
    '''
    query = ElasticQuery(Query.match_all())
    sort_specs = (
        'start:asc,_score',
        {"sort": [{"start": {"order": "asc", "mode": "avg"}}]},
    )
    for spec in sort_specs:
        search = Search(query, idx=ElasticSettings.idx('DEFAULT'), qsort=Sort(spec))
        self._check_sort_order(search.search().docs)
    self.assertRaises(QueryError, Sort, 1)
def _entrez_ensembl_lookup(cls, gene_sets, section, config=None):
    ''' Get an entrez:ensembl id dictionary.

    The entrez ids are first passed through the gene history check and the
    old-id replacement, then searched for in the index named in
    C{section['index']}. Each document's C{dbxrefs} entrez value is mapped
    to the document id.
    '''
    history = Gene._check_gene_history(gene_sets, config)
    (newgene_ids, discontinued_ids) = history
    current_ids = Gene._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)

    entrez_filter = TermsFilter.get_terms_filter("dbxrefs.entrez", current_ids)
    equery = ElasticQuery.filtered(Query.match_all(), entrez_filter,
                                   sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
    search = Search(equery, idx=section['index'], size=len(current_ids))

    lookup = {}
    for doc in search.search().docs:
        entrez_id = getattr(doc, 'dbxrefs')['entrez']
        lookup[entrez_id] = doc.doc_id()
    return lookup
def test_gene_attributes(self):
    ''' Fetch random genes from elastic and compare the same with the results
    fetched via ensembl restful query.

    The assembly number parsed from the index name, the gene id, position,
    chromosome, strand, biotype, symbol and source of the document are
    compared with the ensembl record. When no ensembl data is returned a
    warning is logged and nothing is asserted for that gene.
    '''
    (idx, idx_type) = ElasticSettings.idx('GENE', 'GENE').split('/')
    docs_by_geneid = DataIntegrityUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(),
                                                     sources=[], size=1)

    # index names look like genes_hg38_v0.0.2; assembly names look like GRCh38
    index_pattern = re.compile(r'genes_\w\w(\d+)', re.IGNORECASE)
    assembly_pattern = re.compile(r'GRCh(\d+)', re.IGNORECASE)

    # "_source":{"symbol": "RP11-376M2.2", "start": 42975689, "biotype": "sense_intronic", "chromosome": "17",
    # "source": "havana", "strand": "-", "stop": 42977275}
    for doc in docs_by_geneid:
        gene_id_pipeline = doc.doc_id()
        index_pipeline = doc.index()
        start_pipeline = getattr(doc, "start")
        stop_pipeline = getattr(doc, "stop")
        chromosome_pipeline = getattr(doc, "chromosome")
        biotype_pipeline = getattr(doc, "biotype")
        # ensembl reports the strand as -1/1, the index stores it as '-'/'+'
        strand_pipeline = -1 if getattr(doc, "strand") == '-' else 1
        symbol_pipeline = getattr(doc, "symbol")
        source_pipeline = getattr(doc, "source")

        match = index_pattern.match(index_pipeline)
        assembly_number_pipeline = match.group(1) if match else None

        ensembl_gene_data = DataIntegrityUtils.fetch_from_ensembl(gene_id_pipeline)
        if ensembl_gene_data:
            match = assembly_pattern.match(ensembl_gene_data['assembly_name'])
            assembly_number_ens = match.group(1) if match else None

            self.assertEqual(assembly_number_pipeline, assembly_number_ens, "Assembly number is ok")
            self.assertEqual(gene_id_pipeline, ensembl_gene_data['id'], "Gene Id number is ok")
            self.assertEqual(start_pipeline, ensembl_gene_data['start'], "start is ok")
            self.assertEqual(stop_pipeline, ensembl_gene_data['end'], "stop is ok")
            self.assertEqual(chromosome_pipeline, ensembl_gene_data['seq_region_name'], "chr is ok")
            self.assertEqual(strand_pipeline, ensembl_gene_data['strand'], "strand is ok")
            self.assertEqual(biotype_pipeline, ensembl_gene_data['biotype'], "biotype is ok")
            self.assertEqual(symbol_pipeline, ensembl_gene_data['display_name'],
                             "symbol/display_name is ok")
            self.assertEqual(source_pipeline, ensembl_gene_data['source'], "source is ok")
        else:
            logger.warning("No test run....no ensembl data via ensembl webservice")
def get_studies(cls, study_ids=None, disease_code=None, sources=[], split_name=True):
    ''' Get study documents, sorted alphanumerically by study_id.

    @type  study_ids: list
    @keyword study_ids: Restrict to these study ids; only used when no
                        disease code is given.
    @type  disease_code: string
    @keyword disease_code: Restrict to studies whose diseases match this code.
    @type  sources: list
    @keyword sources: Document fields to return.
    @type  split_name: boolean
    @keyword split_name: When true, each study_name is cut at its first ':'.
    '''
    if disease_code is not None:
        query = BoolQuery(must_arr=Query.term("diseases", disease_code))
    elif study_ids:
        query = Query.ids(study_ids)
    else:
        query = Query.match_all()

    search = Search(ElasticQuery(query, sources=sources),
                    idx=ElasticSettings.idx('STUDY', 'STUDY'), size=200)
    studies = search.search().docs

    if split_name:
        for doc in studies:
            name = getattr(doc, 'study_name')
            if name is not None:
                setattr(doc, 'study_name', name.split(':', 1)[0])
    return Document.sorted_alphanum(studies, "study_id")
def get_rdm_docs(cls, idx, idx_type, qbool=Query.match_all(), sources=[], size=1):
    ''' Get random docs from the indices.

    The query is wrapped in a function score query whose random_score
    function, seeded with a fresh random integer, replaces the score.

    @return: the documents of the search result.
    '''
    seed = random.randint(0, 1000000)
    random_score = ScoreFunction.create_score_function('random_score', seed=seed)
    scored_query = FunctionScoreQuery(qbool, [random_score], boost_mode='replace')
    search = Search(search_query=ElasticQuery(scored_query, sources=sources),
                    size=size, idx=idx, idx_type=idx_type)
    try:
        docs = search.search().docs
    except IndexError:
        # NOTE(review): retries with a new seed; there is no limit on the number of retries
        docs = cls.get_rdm_docs(idx, idx_type, qbool, sources, size)
    return docs
def docs_by_query(cls, idx, idx_type='', query=Query.match_all()):
    ''' Delete all documents specified by a Query.

    The matching documents are scrolled through and each batch of hits is
    turned into bulk delete actions that are loaded straight away.
    '''
    delete_action = '{"delete": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n'

    def delete_docs(resp_json):
        # one delete action line per hit in this batch
        json_data = ''.join(delete_action % (hit['_index'], hit['_type'], hit['_id'])
                            for hit in resp_json['hits']['hits'])
        Bulk.load(idx, idx_type, json_data)

    id_only_query = ElasticQuery(query, sources='_id')
    ScanAndScroll.scan_and_scroll(idx, idx_type=idx_type, call_fun=delete_docs, query=id_only_query)
def get_rdm_feature_ids(cls, idx, idx_type, qbool=Query.match_all(), sources=[], field=None, size=1):
    ''' Get random feature_ids from the indices.

    @return: list with the value of C{field} for each random document, or
             the document ids when no field is given.
    '''
    random_docs = cls.get_rdm_docs(idx, idx_type, qbool=qbool, sources=sources, size=size)
    if field is None:
        return [doc.doc_id() for doc in random_docs]
    return [getattr(doc, field) for doc in random_docs]
def test_bool_nested_filter(self):
    ''' Test combined Bool filter.

    A bool query with two must clauses is used as one of the should
    clauses of an outer bool query.
    '''
    id_match = Query.match("id", "rs768019142").query_wrap()
    inner_bool = BoolQuery()
    inner_bool.must(id_match).must(Query.term("seqid", 1))

    outer_bool = BoolQuery()
    outer_bool.should(inner_bool).should(Query.term("seqid", 2))

    query = ElasticQuery.filtered_bool(Query.match_all(), outer_bool, sources=["id", "seqid", "start"])
    search = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total >= 1, "Nested bool filter query")
def test_bool_filtered_query4(self):
    ''' Test building and running a filtered boolean query.
    Note: ElasticQuery used to wrap match in a query object. '''
    should_clauses = (RangeQuery("start", lte=20000), Query.term("seqid", 2))
    must_clauses = (Query.match("id", "rs768019142").query_wrap(), Query.term("seqid", 1))

    query_bool = BoolQuery()
    chain = query_bool
    # chain the calls in the same order: both should clauses, then both must clauses
    for clause in should_clauses:
        chain = chain.should(clause)
    for clause in must_clauses:
        chain = chain.must(clause)

    query = ElasticQuery.filtered_bool(Query.match_all(), query_bool, sources=["id", "seqid", "start"])
    search = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def get_site_diseases(cls, tier=None):
    ''' Returns the disease documents, ordered by code, separated into main
    (tier 0) and other (tier 1); documents of any other tier are left out.

    @type  tier: integer
    @keyword tier: Tier to filter diseases by (default: None).
    @return: tuple of (main, other) document lists.
    '''
    if tier is None:
        query = Query.match_all()
    else:
        query = FilteredQuery(Query.match_all(), Filter(Query.term("tier", tier)))

    search = Search(search_query=ElasticQuery(query), idx=ElasticSettings.idx('DISEASE', 'DISEASE'),
                    qsort=Sort('code:asc'))

    main = []
    other = []
    for doc in search.search().docs:
        doc_tier = getattr(doc, "tier")
        if doc_tier == 0:
            main.append(doc)
        elif doc_tier == 1:
            other.append(doc)
    return (main, other)
def post(self, request, *args, **kwargs):
    ''' Get study hits for a posted ensembl gene id, marker or list of markers.

    The first non-empty POST parameter out of C{ens_id}, C{marker} and
    C{markers[]} is used to filter the STUDY_HITS documents. In each hit
    the gene ids are replaced by an id:symbol dictionary and the pmid by a
    dictionary with the pmid, the last word of the first author's name and
    the journal.

    @return: JsonResponse of the decorated hits, or a status 400
             JsonResponse with an C{error} message when none of the three
             parameters was posted.
    '''
    ens_id = self.request.POST.get('ens_id')
    marker = self.request.POST.get('marker')
    markers = self.request.POST.getlist('markers[]')

    if ens_id:
        sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
    elif marker:
        sfilter = Filter(Query.query_string(marker, fields=["marker"]).query_wrap())
    elif markers:
        sfilter = Filter(Query.query_string(' '.join(markers), fields=["marker"]).query_wrap())
    else:
        # without this branch sfilter would be unbound below
        return JsonResponse({'error': 'One of ens_id, marker or markers[] is required.'}, status=400)

    query = ElasticQuery.filtered(Query.match_all(), sfilter)
    elastic = Search(query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'), size=500)
    study_hits = elastic.get_json_response()['hits']

    # collect the gene ids and pmids of all hits so each is looked up in one go
    ens_ids = []
    pmids = []
    for hit in study_hits['hits']:
        source = hit['_source']
        if 'pmid' in source:
            pmids.append(source['pmid'])
        if 'genes' in source:
            ens_ids.extend(source['genes'])
    docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
    pub_docs = PublicationDocument.get_pub_docs_by_pmid(pmids, sources=['authors.name', 'journal'])

    for hit in study_hits['hits']:
        source = hit['_source']
        genes = {}
        if 'genes' in source:
            for gene_id in source['genes']:
                try:
                    genes[gene_id] = getattr(docs[gene_id], 'symbol')
                except KeyError:
                    # no gene document found: fall back to the id itself,
                    # keeping the symbols already resolved for this hit
                    genes[gene_id] = gene_id
        source['genes'] = genes

        if 'pmid' in source:
            pmid = source['pmid']
            try:
                authors = getattr(pub_docs[pmid], 'authors')
                journal = getattr(pub_docs[pmid], 'journal')
                source['pmid'] = {
                    'pmid': pmid,
                    'author': authors[0]['name'].rsplit(None, 1)[-1] if authors else "",
                    'journal': journal,
                }
            except KeyError:
                source['pmid'] = {'pmid': pmid}
    return JsonResponse(study_hits)
def _entrez_ensembl_lookup(cls, gene_sets, section, config=None):
    ''' Get an entrez:ensembl id dictionary.

    Same lookup as a plain search would give, but the matching documents
    are collected batch by batch with a scan and scroll over the index
    named in C{section['index']}.
    '''
    history = Gene._check_gene_history(gene_sets, config)
    (newgene_ids, discontinued_ids) = history
    current_ids = Gene._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)
    lookup = {}

    def process_hits(resp_json):
        # add the entrez:ensembl pairs of this batch of hits
        batch = {}
        for hit in resp_json['hits']['hits']:
            doc = Document(hit)
            entrez_id = getattr(doc, 'dbxrefs')['entrez']
            batch[entrez_id] = doc.doc_id()
        lookup.update(batch)

    entrez_filter = TermsFilter.get_terms_filter("dbxrefs.entrez", current_ids)
    equery = ElasticQuery.filtered(Query.match_all(), entrez_filter,
                                   sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
    ScanAndScroll.scan_and_scroll(section['index'], call_fun=process_hits, query=equery)
    return lookup
def _check_gene_history(cls, gene_sets, section):
    ''' Look up discontinued gene ids in the gene history index type.

    @return: tuple of (dict mapping discontinued gene id to its replacement
             gene id, list of discontinued gene ids without a replacement);
             all ids are returned as strings.
    '''
    history_filter = TermsFilter.get_terms_filter("discontinued_geneid", gene_sets)
    query = ElasticQuery.filtered(Query.match_all(), history_filter)
    search = Search(query, idx=section['index'], idx_type=section['index_type_history'], size=1000000)

    replaced_ids = {}
    retired_ids = []
    for doc in search.search().docs:
        current_id = getattr(doc, 'geneid')
        old_id = str(getattr(doc, 'discontinued_geneid'))
        if current_id is not None:
            replaced_ids[old_id] = str(current_id)
        else:
            retired_ids.append(old_id)
    return (replaced_ids, retired_ids)
def test_delete_docs_by_query(self):
    ''' Test deleting docs using a query. '''
    self.set_up()
    idx = IDX['MARKER']['indexName']
    elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
    hits_total1 = elastic.get_count()['count']
    self.assertGreater(hits_total1, 0, "contains documents")

    # delete single doc
    Delete.docs_by_query(idx, query=Query.term("id", "rs2476601"))
    Search.index_refresh(idx)
    hits_total2 = elastic.get_count()['count']
    self.assertEqual(hits_total2, hits_total1-1, "contains documents")

    # delete remaining docs
    Delete.docs_by_query(idx, 'marker')
    Search.index_refresh(idx)
    self.assertEqual(elastic.get_count()['count'], 0, "contains no documents")
def test_doc_auth(self):
    """ Test that a STUDY_HITS document is hidden from the search once it is private.

    A document is searched for by its marker, its group_name is then
    updated to DIL and the same search is expected to return one hit less.
    """
    idx = PydginTestSettings.IDX["STUDY_HITS"]["indexName"]
    docs = Search(ElasticQuery(Query.match_all(), sources=["chr_band", "marker"]),
                  idx=idx, size=1).search().docs
    self.assertEqual(len(docs), 1, "STUDY_HITS document")
    marker_id = getattr(docs[0], "marker")

    url = reverse("search_page")
    resp = self.client.post(url + "?idx=ALL&query=" + marker_id)
    nhits1 = resp.context["hits_total"]
    self.assertGreater(nhits1, 0, "search hits > 0")

    # update document to be in DIL
    update_field = {"doc": {"group_name": "DIL"}}
    Update.update_doc(docs[0], update_field)
    Search.index_refresh(idx)

    resp = self.client.post(url + "?idx=ALL&query=" + marker_id)
    nhits2 = resp.context["hits_total"]
    self.assertEqual(nhits1 - 1, nhits2, "private document hidden")
def test_doc_auth(self):
    ''' Test private documents are not returned in the search.

    A MARKER document is searched for by its id, its group_name is then
    updated to DIL and the same search is expected to return one hit less.
    '''
    idx = PydginTestSettings.IDX['MARKER']['indexName']
    docs = Search(ElasticQuery(Query.match_all(), sources=['id']), idx=idx, size=1).search().docs
    self.assertEqual(len(docs), 1, "MARKER document")
    marker_id = getattr(docs[0], 'id')

    url = reverse('search_page')
    resp = self.client.post(url+'?idx=ALL&query='+marker_id)
    nhits1 = resp.context['hits_total']
    self.assertGreater(nhits1, 0, 'search hits > 0')

    # update document to be in DIL
    update_field = {"doc": {"group_name": "DIL"}}
    Update.update_doc(docs[0], update_field)
    Search.index_refresh(idx)

    resp = self.client.post(url+'?idx=ALL&query='+marker_id)
    nhits2 = resp.context['hits_total']
    self.assertEqual(nhits1-1, nhits2, 'private document hidden')
def _check_gene_history(cls, gene_sets, config):
    ''' Look up discontinued gene ids with a single search of the gene history index.

    NOTE: the original author flagged this as needing a better approach.

    @return: tuple of (dict mapping discontinued gene id to its replacement
             gene id, list of discontinued gene ids without a replacement);
             all ids are returned as strings.
    '''
    section = config['GENE_HISTORY']
    history_filter = TermsFilter.get_terms_filter("discontinued_geneid", gene_sets)
    query = ElasticQuery.filtered(Query.match_all(), history_filter,
                                  sources=['geneid', 'discontinued_geneid'])
    # at most one result per requested id is asked for
    search = Search(query, idx=section['index'], idx_type=section['index_type'], size=len(gene_sets))

    replaced_ids = {}
    retired_ids = []
    for doc in search.search().docs:
        current_id = getattr(doc, 'geneid')
        old_id = getattr(doc, 'discontinued_geneid')
        if current_id is None:
            retired_ids.append(str(old_id))
            continue
        replaced_ids[str(old_id)] = str(current_id)
    return (replaced_ids, retired_ids)
def test_pad_region(self):
    ''' Test the padding of a region based on its disease_loci & hits.

    A random region document must have none of the padded attributes set;
    after padding it must have positional, marker and hit details.
    '''
    (idx, idx_type) = ElasticSettings.idx(RegionTest.IDX_KEY, 'REGION').split('/')
    docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(), sources=[], size=1)
    region = docs[0]

    unpadded_checks = (
        ("build_info", "Region doesn't contain any positional details"),
        ("markers", "Region doesn't contain any marker details"),
        ("hits", "Region doesn't contain any HIT details"),
        ("genes", "Region doesn't contain any gene details"),
        ("studies", "Region doesn't contain any study details"),
        ("pmids", "Region doesn't contain any publication details"),
    )
    for attr, msg in unpadded_checks:
        self.assertFalse(getattr(region, attr), msg)

    padded = utils.Region.pad_region_doc(region)
    self.assertTrue(getattr(padded, "build_info"), "New region contains positional details")
    padded_checks = (
        ("markers", "New region contains marker details", "New region contains at least 1 marker"),
        ("hits", "New region contains hit details", "New region contains at least 1 HIT"),
    )
    for attr, present_msg, count_msg in padded_checks:
        self.assertTrue(getattr(padded, attr), present_msg)
        self.assertGreaterEqual(len(getattr(padded, attr)), 1, count_msg)
def _update_gene(cls, genes, idx): ''' Use genes data to update the index. ''' gene_keys = list(genes.keys()) chunk_size = 450 for i in range(0, len(genes), chunk_size): chunk_gene_keys = gene_keys[i:i+chunk_size] json_data = '' query = ElasticQuery.filtered(Query.match_all(), TermsFilter.get_terms_filter("dbxrefs.entrez", chunk_gene_keys)) docs = Search(query, idx=idx, size=chunk_size).search().docs for doc in docs: ens_id = doc._meta['_id'] idx_type = doc.type() entrez = getattr(doc, 'dbxrefs')['entrez'] doc_data = {"update": {"_id": ens_id, "_type": idx_type, "_index": idx, "_retry_on_conflict": 3}} json_data += json.dumps(doc_data) + '\n' json_data += json.dumps({'doc': genes[entrez]}) + '\n' if json_data != '': Loader().bulk_load(idx, idx_type, json_data)
def studies_details(request):
    """ Get studies for a given ensembl ID.

    The STUDY_HITS documents whose genes match the posted C{ens_id} are
    returned. In each hit the gene ids are replaced by an id:symbol
    dictionary (empty when the hit lists no genes) and the pmid by a
    dictionary with the pmid, the last word of the first author's name
    (empty when there are no authors) and the journal.

    @return: JsonResponse of the decorated hits.
    """
    ens_id = request.POST.get("ens_id")
    sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), sfilter)
    elastic = Search(query, idx=ElasticSettings.idx("REGION", "STUDY_HITS"), size=500)
    study_hits = elastic.get_json_response()["hits"]

    # collect the gene ids and pmids of all hits so each is looked up in one go
    ens_ids = []
    pmids = []
    for hit in study_hits["hits"]:
        source = hit["_source"]
        if "pmid" in source:
            pmids.append(source["pmid"])
        if "genes" in source:
            ens_ids.extend(source["genes"])
    docs = _get_gene_docs_by_ensembl_id(ens_ids, ["symbol"])
    pub_docs = _get_pub_docs_by_pmid(pmids, sources=["authors.name", "journal"])

    for hit in study_hits["hits"]:
        source = hit["_source"]
        genes = {}
        if "genes" in source:
            for gene_id in source["genes"]:
                try:
                    genes[gene_id] = getattr(docs[gene_id], "symbol")
                except KeyError:
                    # no gene document found: fall back to the id itself,
                    # keeping the symbols already resolved for this hit
                    genes[gene_id] = gene_id
        source["genes"] = genes

        if "pmid" in source:
            pmid = source["pmid"]
            try:
                authors = getattr(pub_docs[pmid], "authors")
                journal = getattr(pub_docs[pmid], "journal")
                source["pmid"] = {
                    "pmid": pmid,
                    "author": authors[0]["name"].rsplit(None, 1)[-1] if authors else "",
                    "journal": journal,
                }
            except KeyError:
                source["pmid"] = {"pmid": pmid}
    return JsonResponse(study_hits)
def get_new_pmids(cls, pmids, idx, disease_code=None):
    ''' Find PMIDs in a list that are not in the elastic index.

    When a disease code is given, every document found that does not yet
    carry it in its disease tags gets the code appended through a bulk
    update.

    @return: the PMIDs from C{pmids}, in their original order, that were
             not found in the index.
    '''
    chunk_size = 800
    pmids_found = set()
    # NOTE(review): fixed pause before querying; the reason is not visible in this function
    time.sleep(5)

    for offset in range(0, len(pmids), chunk_size):
        terms_filter = TermsFilter.get_terms_filter("pmid", pmids[offset:offset+chunk_size])
        query = ElasticQuery.filtered(Query.match_all(), terms_filter, sources=['pmid', 'tags'])

        bulk_lines = []
        for doc in Search(query, idx=idx, size=chunk_size).search().docs:
            pmids_found.add(getattr(doc, 'pmid'))
            if disease_code is None:
                continue

            tags = getattr(doc, 'tags')
            disease = tags['disease'] if 'disease' in tags else []
            if disease_code in disease:
                continue

            # update disease attribute
            disease.append(disease_code)
            tags['disease'] = disease
            idx_name = doc._meta['_index']
            idx_type = doc.type()
            action = {"update": {"_id": doc._meta['_id'], "_type": idx_type,
                                 "_index": idx_name, "_retry_on_conflict": 3}}
            bulk_lines.append(json.dumps(action) + '\n')
            bulk_lines.append(json.dumps({'doc': {'tags': tags}}) + '\n')

        if bulk_lines:
            # index name and type are those of the last document updated in this chunk
            Loader().bulk_load(idx_name, idx_type, ''.join(bulk_lines))

    return [pmid for pmid in pmids if pmid not in pmids_found]
def genesets_details(request):
    ''' Get pathway gene sets for a given ensembl ID.

    In each PATHWAY hit the list of gene ids is replaced by an id:symbol
    dictionary; ids without a gene document map to themselves.
    '''
    ens_id = request.POST.get('ens_id')
    geneset_filter = Filter(Query.query_string(ens_id, fields=["gene_sets"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), geneset_filter)
    search = Search(query, idx=ElasticSettings.idx('GENE', 'PATHWAY'), size=500)
    genesets_hits = search.get_json_response()['hits']
    hits = genesets_hits['hits']

    member_ids = [gene_id for hit in hits for gene_id in hit['_source']['gene_sets']]
    docs = utils.get_gene_docs_by_ensembl_id(member_ids, ['symbol'])

    for hit in hits:
        symbols = {}
        for gene_id in hit['_source']['gene_sets']:
            try:
                symbols[gene_id] = getattr(docs[gene_id], 'symbol')
            except KeyError:
                symbols[gene_id] = gene_id
        hit['_source']['gene_sets'] = symbols
    return JsonResponse(genesets_hits)
def genesets_details(request):
    """ Get pathway gene sets for a given ensembl ID.

    The gene ids of every PATHWAY hit are resolved to symbols with one
    lookup, then each hit's gene_sets list is replaced by an id:symbol
    dictionary in which unresolved ids map to themselves.
    """
    posted_id = request.POST.get("ens_id")
    geneset_filter = Filter(Query.query_string(posted_id, fields=["gene_sets"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), geneset_filter)
    elastic = Search(query, idx=ElasticSettings.idx("GENE", "PATHWAY"), size=500)
    genesets_hits = elastic.get_json_response()["hits"]

    all_ids = []
    for hit in genesets_hits["hits"]:
        all_ids.extend(hit["_source"]["gene_sets"])
    docs = _get_gene_docs_by_ensembl_id(all_ids, ["symbol"])

    for hit in genesets_hits["hits"]:
        source = hit["_source"]
        resolved = {}
        for member_id in source["gene_sets"]:
            try:
                resolved[member_id] = getattr(docs[member_id], "symbol")
            except KeyError:
                resolved[member_id] = member_id
        source["gene_sets"] = resolved
    return JsonResponse(genesets_hits)
def _convert_entrezid2ensembl(cls, gene_sets, section, log_output_file_handler=None, log_conversion=True):
    ''' Convert a set of entrez ids to ensembl ids using the gene index dbxrefs.

    @type  gene_sets: list
    @param gene_sets: Entrez ids to convert.
    @type  section: dict
    @param section: Configuration section; its C{index} entry names the gene index.
    @keyword log_output_file_handler: Handler passed to the conversion logger;
                                      nothing is logged when it is None.
    @type  log_conversion: bool
    @keyword log_conversion: Set to False to suppress the conversion log.
    @return: List of ensembl ids (the C{_id} of every matching gene document).
    '''
    # consult gene_history first, then swap old ids for their replacements
    newgene_ids, discontinued_ids = cls._check_gene_history(gene_sets, section)
    current_ids = cls._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)

    query = ElasticQuery.filtered(Query.match_all(),
                                  TermsFilter.get_terms_filter("dbxrefs.entrez", current_ids))
    hits = Search(query, idx=section['index'], size=1000000).search().docs
    ensembl_ids = [hit._meta['_id'] for hit in hits]

    if log_conversion and log_output_file_handler is not None:
        cls._log_entrezid2ensembl_coversion(current_ids, ensembl_ids, log_output_file_handler)
    return ensembl_ids
def get_rdm_docs(cls, idx, idx_type, qbool=Query.match_all(), sources=[], size=1):
    ''' Get random doc(s) from the indices.

    The given query is wrapped in a function score query whose only score
    function is C{random_score}, seeded with a fresh random integer, and
    C{boost_mode='replace'}.

    @type  idx: string
    @param idx: Index to search.
    @type  idx_type: string
    @param idx_type: Index type to search.
    @keyword qbool: Query to pick documents from (default: match all).
    @keyword sources: Source fields to return.
    @keyword size: Number of documents to request (default: 1).
    @return: The documents of the search result.
    '''
    # NOTE(review): both defaults are created once at definition time and are
    # shared between calls; neither is modified in this function.
    seed = random.randint(0, 1000000)
    random_score = ScoreFunction.create_score_function('random_score', seed=seed)
    scored_query = FunctionScoreQuery(qbool, [random_score], boost_mode='replace')
    search = Search(search_query=ElasticQuery(scored_query, sources=sources),
                    size=size, idx=idx, idx_type=idx_type)
    try:
        docs = search.search().docs
    except IndexError:
        # Try again with a new seed.
        # NOTE(review): the retry is unbounded - if the search keeps raising
        # IndexError this recurses until the interpreter's recursion limit.
        docs = cls.get_rdm_docs(idx, idx_type, qbool, sources, size)
    return docs
def chicpeaSearch(request, url):
    ''' Search the capture Hi-C interaction data and return the result as JSON.

    GET parameters read:
      - C{targetIdx}: key of the CP_TARGET dataset to search.
      - C{searchTerm}: a gene name, a marker id (C{rs...}) or a region
        (C{chr:start-end}, where commas are ignored and C{..} may be used
        instead of C{-}). A missing term is treated as an empty string.
      - C{region}: optional region that takes precedence over a region given
        in C{searchTerm}.
      - C{snp_track}: optional C{<group>-<track>} name of a SNP track. It is
        dropped when it does not have that form or the user is not authorised
        for it (the C{ud} group is not checked).

    The search is run as one of three types: 'region', 'snp' or (default) 'gene'.

    @param request: The HTTP request.
    @param url: Not used in this function.
    @return: L{JsonResponse} holding either an C{error} entry (optionally with
             a list of candidate genes) or the interactions (C{hic}),
             fragments, SNPs, genes and metadata for the segment.
    '''
    queryDict = request.GET
    user = request.user
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    hic = []
    addList = []
    searchType = 'gene'
    regionPattern = r"(.*):(\d+)-(\d+)"

    # A request without searchTerm used to fail on None.upper(); use '' instead.
    rawSearchTerm = queryDict.get("searchTerm") or ""
    searchTerm = rawSearchTerm.upper()
    searchTerm = searchTerm.replace(",", "")
    searchTerm = searchTerm.replace("..", "-")
    snpTrack = queryDict.get("snp_track")

    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(
        user=user, idx_keys=None, idx_type_keys=None)

    if snpTrack:
        mo = re.match(r"(.*)-(.*)", snpTrack)
        if mo is None:
            # No "<group>-<track>" structure, so the permission check cannot
            # be made: drop the track in the same way as an unauthorised one.
            snpTrack = None
        else:
            group = mo.group(1)
            if group != 'ud' and 'CP_STATS_'+group.upper()+'.'+snpTrack.upper() not in idx_type_keys_auth:
                snpTrack = None

    if targetIdx not in utils.tissues:
        # fill the tissue lists from the mapping metadata of each authorised target
        for target in getattr(chicp_settings, 'CP_TARGET'):
            if 'CP_TARGET_'+target not in idx_keys_auth:
                if targetIdx == target:
                    retJSON = {'error': 'Sorry, you do not have permission to view this dataset.'}
                    return JsonResponse(retJSON)
                continue
            elasticJSON = Search(idx=ElasticSettings.idx('CP_TARGET_'+target)).get_mapping(mapping_type="gene_target")
            tissueList = list(elasticJSON[ElasticSettings.idx('CP_TARGET_'+target)]
                              ['mappings']['gene_target']['_meta']['tissue_type'].keys())
            utils.tissues['CP_TARGET_'+target] = tissueList

    if queryDict.get("region") or re.match(regionPattern, searchTerm):
        searchType = 'region'
        region = queryDict.get("region")
        if not region:
            # the search term itself is the region, so there is no term left
            region = searchTerm
            searchTerm = ""
        mo = re.match(regionPattern, region)
        if mo is None:
            # only reachable through a malformed 'region' parameter
            retJSON = {'error': 'Region '+region+' is not in the format chr:start-end.'}
            return JsonResponse(retJSON)
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "")
        chrom = chrom.replace('CHR', "")

    if re.search("^rs[0-9]+", searchTerm.lower()):
        searchTerm = searchTerm.lower()
        addList.append(_find_snp_position(snpTrack, searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        if searchType != 'region':
            searchType = 'snp'

    logger.warning("### %s - %s ###", searchType, searchTerm)

    if searchType == 'region':
        query_bool = BoolQuery()
        filter_bool = BoolQuery()
        if searchTerm and len(addList) == 0 and \
                re.match(regionPattern, rawSearchTerm.replace(",", "")) is None:
            # a region parameter combined with a free-text (non-marker) term
            query_bool.must([Query.query_string(searchTerm, fields=["name", "ensg"]),
                             Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        else:
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if len(addList) > 0:
            # marker given: bait or other end must span the marker position
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])
        else:
            # bait or other end must lie within the requested segment
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
        (hic, v1, v2) = _build_hic_query(query, targetIdx, segmin, segmax)  # @UnusedVariable

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': rawSearchTerm+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        if len(addList) > 0:
            chrom = addList[0]['chr']

            query_bool = BoolQuery()
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
            query_bool = _add_tissue_filter(query_bool, targetIdx)

            filter_bool = BoolQuery()
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])

            query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                               sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
            hic, segmin, segmax = _build_hic_query(query, targetIdx)

            if "error" in hic:
                return JsonResponse(hic)
            if len(hic) == 0:
                retJSON = {'error': 'Marker '+searchTerm+' does not overlap any bait/target regions in this dataset.'}
                return JsonResponse(retJSON)
    else:
        # geneQuery = ElasticQuery.query_string(searchTerm, fields=["gene_name"])
        geneQuery = ElasticQuery.filtered(Query.match_all(),
                                          Filter(Query.match("gene_name", searchTerm).query_wrap()))
        # size=0: only the number of matching genes is needed at this point
        resultObj = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/',
                           search_query=geneQuery, size=0, qsort=Sort('seqid:asc,start')).search()
        if resultObj.hits_total > 1:
            geneResults = []
            resultObj2 = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/',
                                search_query=geneQuery, size=(resultObj.hits_total+1),
                                qsort=Sort('seqid:asc,start')).search()
            docs = resultObj2.docs
            # [1:-1] drops the first and last character of the stored gene_id
            gene_ids = [getattr(doc, 'attr')['gene_id'][1:-1] for doc in docs]

            # keep only the candidate genes that occur in the target dataset
            query = ElasticQuery.filtered(Query.match_all(), TermsFilter.get_terms_filter('ensg', gene_ids))
            agg = Agg('ensg_agg', "terms", {"field": "ensg", "size": 0})
            res = Search(idx=ElasticSettings.idx('CP_TARGET_'+targetIdx), search_query=query,
                         aggs=Aggs(agg), size=0).search()

            ensg_count = res.aggs['ensg_agg'].get_buckets()
            gene_ids = [g['key'] for g in ensg_count]

            for d in resultObj2.docs:
                if getattr(d, "attr")["gene_id"].replace('\"', '') in gene_ids:
                    geneResults.append({
                        'gene_name': getattr(d, "attr")["gene_name"].replace('\"', ''),
                        'gene_id': getattr(d, "attr")["gene_id"].replace('\"', ''),
                        'location': "chr" + getattr(d, "seqid") + ":" +
                        locale.format_string("%d", getattr(d, "start"), grouping=True) + ".." +
                        locale.format_string("%d", getattr(d, "end"), grouping=True),
                    })

            if len(geneResults) == 0:
                retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
                return JsonResponse(retJSON)
            elif len(geneResults) > 1:
                retJSON = {
                    'error': 'Gene name <strong>'+searchTerm+'</strong> returns too many hits, please select your prefered result from the list below.',
                    'results': geneResults,
                    'cols': ['HGNC Symbol', 'Ensembl Gene ID', 'Location']
                }
                return JsonResponse(retJSON)

        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])

        (hic, segmin, segmax) = _build_hic_query(query, targetIdx)

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
            return JsonResponse(retJSON)
        chrom = hic[0]['baitChr']

    # chrom is only assigned on some of the paths above; reading an unassigned
    # local raises UnboundLocalError, which is a subclass of NameError.
    try:
        chrom
    except NameError:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(snpTrack, chrom, segmin, segmax)
    frags = _build_frags_query(getattr(chicp_settings, 'DEFAULT_FRAG'), chrom, segmin, segmax)

    addList = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], addList)

    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": int(segmin),
                        "oend": int(segmax),
                        "rstart": 1,
                        "rend": int(segmax) - int(segmin),
                        "rchr": str(chrom),
                        "tissues": utils.tissues['CP_TARGET_'+targetIdx]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }

    response = JsonResponse(retJSON)
    return response
def _add_diseases():
    ''' Fetch the diseases to add to a context.

    Runs a match-all query against the 'disease' index, requesting at most
    100 documents, and returns the raw list of hits from the JSON response.
    '''
    match_all = ElasticQuery(Query.match_all())
    response = Search(search_query=match_all, size=100, idx='disease').get_json_response()
    return response['hits']['hits']
def test_elastic_group_name(self):
    '''
    Testing the workflow defined in:
    https://killin.cimr.cam.ac.uk/nextgensite/2015/08/05/region-authorization/

    Testing various elastic queries against region documents that are either
    public (no group_name) or private to a group.

    idx doc:
     "_source":{"attr": {"region_id": "803", "group_name": "[\"DIL\"]", "Name": "4q27"},
     "seqid": "chr4", "source": "immunobase", "type": "region",
     "score": ".", "strand": ".", "phase": ".", "start": 122061159, "end": 122684373}

    idx_query:
    Private(in given group) OR Public
    -d '{"query":{"filtered":{"filter":{"bool": {
                                        "should": [
                                                    {"terms": {"group_name":["dil"]}},
                                                    { "missing": { "field": "group_name"   }}
                                                  ]
                                                }}}}}'

    Private(in given group):
    -d '{"query":{"filtered":{"filter":{"terms":{"group_name":["dil"]}}}}}'

    Public:
    -d {'query': {'filtered': {'filter': {'missing': {'field': 'group_name'}},
    -                         'query': {'term': {'match_all': '{}'}}}}}
    '''
    # get the groups for the given user
    response = self.client.post('/accounts/login/', {
        'username': '******', 'password': '******'
    })
    # NOTE(review): assertTrue takes "200" as the failure message, so this
    # passes for any truthy status code - confirm the status that is expected.
    self.assertTrue(response.status_code, "200")

    # the session stores the id of the user that has just logged in
    logged_in_user = User.objects.get(
        id=self.client.session['_auth_user_id'])
    # NOTE(review): the extent of this 'if' block was reconstructed from a
    # flattened source line - confirm against the original file.
    if logged_in_user and logged_in_user.is_authenticated():
        user_groups = get_user_groups(logged_in_user)
        self.assertTrue('READ' in user_groups, "user present in READ group")
        # make sure the user is not yet in DIL group
        self.assertFalse('DIL' in user_groups, "user not present in DIL group")

    # READ is left out of the group names, which are compared in lower case
    group_names = get_user_groups(logged_in_user)
    if 'READ' in group_names: group_names.remove('READ')  # @IgnorePep8
    group_names = [x.lower() for x in group_names]
    self.assertTrue(len(group_names) == 0, "No group present")

    # Match all query, as there is no group we do a match all
    query = ElasticQuery(Query.match_all())
    expected_query_string = {"query": {"match_all": {}}}
    self.assertJSONEqual(json.dumps(query.query), json.dumps(expected_query_string), "Query string matched")

    # refresh the index before searching it
    Search.index_refresh(self.index_name)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    # match all is expected to return all 12 docs of the test index
    self.assertTrue(
        len(docs) == 12, "Elastic string query retrieved all public regions")

    # Filtered query for group names, add the user to DIL group and get the query string
    self.dil_group = Group.objects.create(name='DIL')
    logged_in_user.groups.add(self.dil_group)
    group_names = get_user_groups(logged_in_user)
    if 'READ' in group_names: group_names.remove('READ')  # @IgnorePep8
    group_names = [x.lower() for x in group_names]
    self.assertTrue(len(group_names) > 0, "More than 1 group present")
    self.assertTrue("dil" in group_names, "DIL group present")

    # retrieves all docs with missing field group_name - 11 docs
    terms_filter = TermsFilter.get_missing_terms_filter(
        "field", "attr.group_name")
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(
        len(docs) == 11, "Elastic string query retrieved all public regions")

    # build filtered boolean query to bring all public docs + private docs 11+1 = 12 docs
    # NOTE(review): this uses "group_name" where the filters above and below
    # use "attr.group_name" - confirm that both forms are intended.
    query_bool = BoolQuery()
    query_bool.should(Query.missing_terms("field", "group_name")) \
              .should(Query.terms("group_name", group_names).query_wrap())

    query = ElasticQuery.filtered_bool(Query.match_all(), query_bool)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(
        len(docs) == 12, "Elastic string query retrieved both public + private regions")

    # only the docs that belong to one of the user's groups - 1 doc
    terms_filter = TermsFilter.get_terms_filter("attr.group_name", group_names)
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(
        len(docs) == 1, "Elastic string query retrieved one private regions")
    # the private doc is the region shown in the docstring
    self.assertEqual(docs[0].attr['Name'], "4q27", "type matched region")
    self.assertEqual(docs[0].attr['region_id'], "803", "type matched region")
    self.assertEqual(docs[0].attr['group_name'], "[\"DIL\"]", "type matched region")