def get_criteria(docs, doc_type, doc_attr, idx_type_key):
    """ Return a dictionary of gene name:criteria.

    The lower-cased ``doc_attr`` value of every document in ``docs`` whose
    ``type()`` equals ``doc_type`` is used as a term on the ``Name`` field of
    the CRITERIA index type ``idx_type_key``.

    Each hit contributes ``{Name: [{criteria: value.split(":")}, ...]}``.
    Criteria are listed in alphabetical attribute order with "_Hs" removed
    from their names, values starting with "0" are skipped and the "OD_Hs"
    criteria (if present and not starting with "0") is appended last as "OD".
    An empty dictionary is returned when the index lookup gives None.
    """
    names = [getattr(doc, doc_attr).lower() for doc in docs if doc.type() == doc_type]
    query = Query.terms("Name", names)
    sources = {"exclude": ["Primary id", "Object class", "Total score"]}
    criteria_idx = ElasticSettings.idx("CRITERIA", idx_type_key)
    if criteria_idx is None:
        return {}

    res = Search(ElasticQuery(query, sources=sources), idx=criteria_idx, size=len(names)).search()

    # attributes that are never reported as ordinary criteria
    skipped = ("Name", "_meta", "OD_Hs")
    criteria = {}
    for hit in res.docs:
        gene_name = getattr(hit, "Name")
        gene_criteria = []
        for attr, value in sorted(hit.__dict__.items(), key=lambda item: item[0]):
            if attr in skipped or value.startswith("0"):
                continue
            gene_criteria.append({attr.replace("_Hs", ""): value.split(":")})
        # OD is handled separately so that it always comes last.
        if hasattr(hit, "OD_Hs") and not getattr(hit, "OD_Hs").startswith("0"):
            gene_criteria.append({"OD": getattr(hit, "OD_Hs").split(":")})
        criteria[gene_name] = gene_criteria
    return criteria
def test_region_attributes(self):
    ''' Check that the genes, studies and PMIDs listed on a random (padded)
    region document are all found in their respective indices. '''
    region_idx = ElasticSettings.idx(RegionDataTest.IDX_KEY, 'REGION')
    (idx, idx_type) = region_idx.split('/')
    docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(), sources=[], size=1)
    newRegion = utils.Region.pad_region_doc(docs[0])

    # (region attribute, index key and index type key, assertion message)
    checks = (
        ("genes", 'GENE', "All genes on region found in GENE index"),
        ("studies", 'STUDY', "All study ids for region found in STUDY index"),
        ("pmids", 'PUBLICATION', "All PMIDs for region found in PUBLICATION index"),
    )
    for attr, idx_key, msg in checks:
        ids = getattr(newRegion, attr)
        if len(ids) > 0:
            query = ElasticQuery(Query.ids(ids))
            resultObject = Search(query, idx=ElasticSettings.idx(idx_key, idx_key), size=len(ids)).search()
            self.assertEqual(len(ids), resultObject.hits_total, msg)
def _find_snp_position(snp_track, name):
    ''' Look up the position of the marker called name.

    Without a snp_track the MARKER index is matched on "id". Otherwise
    snp_track is split as "<group>-<track>" and the matching CP_STATS index
    is matched on "name".

    Returns a dictionary with 'chr', 'start' (position-1), 'end' and 'name'
    for the first hit, or a dictionary with an 'error' message if there are
    no hits.
    '''
    if snp_track is None:
        field = "id"
        snp_idx = ElasticSettings.idx('MARKER')
    else:
        field = "name"
        # NOTE(review): a track name without a '-' makes re.match return None
        # and this line raise AttributeError - confirm callers guarantee it.
        (group, track) = re.match(r"(.*)-(.*)", snp_track).group(1, 2)
        stats_key = 'CP_STATS_'+group.upper()
        try:
            snp_idx = ElasticSettings.idx(stats_key, snp_track.upper())
        except SettingsError:
            # no index type configured for the track, use the track name as the type
            snp_idx = ElasticSettings.idx(stats_key)+"/"+track

    response = Search(ElasticQuery.query_match(field, name), idx=snp_idx).get_json_response()
    hits = response['hits']['hits']
    if len(hits) > 0:
        snp = hits[0]['_source']
        position = snp['start']
        return {'chr': snp['seqid'].replace('chr', ""), 'start': (position-1),
                'end': position, 'name': name}
    return {'error': 'Marker '+name+' does not exist in the currently selected dataset'}
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request just the documents required from elastic.

    The page size and offset are taken from the view paginator. The criteria
    index (or list of indices) is resolved by self._get_index() from the
    'feature_type' request parameter (default 'GENE_CRITERIA') when that
    parameter is listed in the view's filter_fields.

    Returns a list of ElasticObject instances built from the hit sources,
    each given a uuid (the hit _id) and a criteria_type (the hit _type).
    The total hit count is stored on view.es_count.
    '''
    q_size = view.paginator.get_limit(request)
    q_from = view.paginator.get_offset(request)

    # only request parameters declared as filterable on the view are used
    filterable = getattr(view, 'filter_fields', [])
    filters = {k: v for k, v in request.GET.items() if k in filterable}
    criteria_idx = self._get_index(filters.get('feature_type', 'GENE_CRITERIA'))

    if isinstance(criteria_idx, list):
        # several index keys are searched together as a comma separated list
        idx = ','.join(ElasticSettings.idx(name) for name in criteria_idx)
    else:
        idx = ElasticSettings.idx(criteria_idx)

    q = ElasticQuery(Query.match_all())
    s = Search(search_query=q, idx=idx, size=q_size, search_from=q_from)
    json_results = s.get_json_response()

    results = []
    for result in json_results['hits']['hits']:
        new_obj = ElasticObject(initial=result['_source'])
        new_obj.uuid = result['_id']
        new_obj.criteria_type = result['_type']
        results.append(new_obj)
    view.es_count = json_results['hits']['total']
    return results
def test_top_hits_sub_agg(self):
    ''' Top hits aggregation used as a sub-aggregation of a terms aggregation on _index. '''
    default_idx = ElasticSettings.idx('DEFAULT')
    top_hits = Agg('idx_top_hits', 'top_hits', {"size": 1})
    by_index = Agg("idxs", "terms", {"field": "_index"}, sub_agg=top_hits)
    by_type = Agg("categories", "terms", {"field": "_type", "size": 0})

    result = Search(aggs=Aggs([by_index, by_type]), idx=default_idx).search()
    buckets = result.aggs['idxs'].get_docs_in_buckets()
    self.assertEqual(buckets[default_idx]['doc_count'], 3)
    self.assertEqual(len(buckets[default_idx]['docs']), 1)
def test_scan_and_scroll(self):
    ''' Test scan and scroll interface, without and with a query. '''
    def check_hits(resp_json):
        # callback given to scan_and_scroll: the response must contain at least one hit
        self.assertTrue('hits' in resp_json, 'scan and scroll hits')
        self.assertGreaterEqual(len(resp_json['hits']['hits']), 1)

    ScanAndScroll.scan_and_scroll(ElasticSettings.idx('DEFAULT'), call_fun=check_hits)

    default_idx = ElasticSettings.idx('DEFAULT')
    marker_query = ElasticQuery.query_string("rs2476601", fields=["id"])
    ScanAndScroll.scan_and_scroll(default_idx, call_fun=check_hits, query=marker_query)
def test_sort_query(self):
    ''' Test sorting for a query, with the sort given as a string and as a
    dictionary, and that an invalid sort argument raises a QueryError. '''
    match_all = ElasticQuery(Query.match_all())

    string_sorted = Search(match_all, idx=ElasticSettings.idx('DEFAULT'), qsort=Sort('start:asc,_score'))
    self._check_sort_order(string_sorted.search().docs)

    dict_sort = Sort({"sort": [{"start": {"order": "asc", "mode": "avg"}}]})
    dict_sorted = Search(match_all, idx=ElasticSettings.idx('DEFAULT'), qsort=dict_sort)
    self._check_sort_order(dict_sorted.search().docs)

    self.assertRaises(QueryError, Sort, 1)
def test_term_query(self):
    ''' Test building and running a term query, without and with a boost. '''
    by_id = Search(ElasticQuery(Query.term("id", "rs2476601")), idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(len(by_id.search().docs) == 1, "Elastic string query retrieved marker (rs2476601)")

    by_seqid = Search(ElasticQuery(Query.term("seqid", "1", boost=3.0)), idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(len(by_seqid.search().docs) > 1, "Elastic string query retrieved markers on chr1")
def test_query_ids(self):
    ''' Test querying by document ids, for a list of ids and for a single id
    restricted to an index type. '''
    two_ids = ElasticQuery(Query.ids(['1', '2']))
    docs = Search(two_ids, idx=ElasticSettings.idx('DEFAULT'), size=5).search().docs
    self.assertTrue(len(docs) == 2, "Elastic string query retrieved marker (rs*)")

    # repeat for one id, limited to the type of the first document returned
    one_id = ElasticQuery(Query.ids('2', types=docs[0].type()))
    docs = Search(one_id, idx=ElasticSettings.idx('DEFAULT'), size=5).search().docs
    self.assertTrue(len(docs) == 1, "Elastic string query retrieved marker (rs*)")
def test_mapping(self):
    ''' Test retrieving the mapping for an index. '''
    default_idx = ElasticSettings.idx('DEFAULT')
    elastic = Search(idx=default_idx)

    mapping = elastic.get_mapping()
    self.assertTrue(default_idx in mapping, "Database name in mapping result")
    # the assertion above guarantees the index key is present here
    self.assertTrue("mappings" in mapping[default_idx], "Mapping result found")

    # check using the index type
    mapping = elastic.get_mapping('marker')
    self.assertTrue(default_idx in mapping, "Database name in mapping result")

    # err check
    mapping = elastic.get_mapping('marker/xx')
    self.assertTrue('error' in mapping, "Database name in mapping result")
def test_term(self):
    ''' Terms Aggregation, on its own and combined with an ids query. '''
    agg_name = "test"
    aggs = Aggs(Agg(agg_name, "terms", {"field": "seqid", "size": 0}))
    r_aggs = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT')).search().aggs
    self.assertTrue(agg_name in r_aggs, "returned test aggregations")

    # Ids Query with Terms Aggregation
    ids_query = ElasticQuery(Query.ids(['1', '2']))
    search = Search(search_query=ids_query, aggs=aggs, idx=ElasticSettings.idx('DEFAULT'), size=5)
    r_aggs = search.search().aggs
    self.assertTrue(len(r_aggs[agg_name].get_buckets()) > 0, "returned test aggregation buckets")
    first_bucket = getattr(r_aggs[agg_name], 'buckets')[0]
    self.assertTrue(first_bucket['doc_count'] >= 0, "bucket document count")
def setUp(self):
    ''' Create a request factory and a user in the READ group, then fill
    utils.tissues with the tissue types of each CP_TARGET index the user is
    authenticated for. '''
    # Every test needs access to the request factory.
    self.factory = RequestFactory()
    self.group, created = Group.objects.get_or_create(name='READ')  # @UnusedVariable
    self.user = User.objects.create_user(username='******', email='*****@*****.**', password='******')
    self.user.groups.add(self.group)

    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(  # @UnusedVariable
        user=self.user, idx_keys=None, idx_type_keys=None)

    for target in getattr(chicp_settings, 'CP_TARGET'):
        target_key = 'CP_TARGET_'+target
        if target_key in idx_keys_auth:
            # tissue names are the keys of the tissue_type dictionary in the mapping _meta
            mapping = Search(idx=ElasticSettings.idx(target_key)).get_mapping(mapping_type="gene_target")
            gene_target = mapping[ElasticSettings.idx(target_key)]['mappings']['gene_target']
            utils.tissues[target_key] = list(gene_target['_meta']['tissue_type'].keys())
def marker_page(request):
    ''' Renders a marker page.

    The marker name is read from the "m" request parameter. Names starting
    with "rs" are searched in the 'id' and 'rscurrent' fields, all others in
    the 'name' field of the MARKER index. Hits are grouped by document type
    into the marker document, immunochip documents and rs merge history
    documents, which are passed to the marker/marker.html template.

    Raises Http404 when no marker name is given or no document matches.
    '''
    query_dict = request.GET
    marker = query_dict.get("m")
    if marker is None:
        messages.error(request, 'No marker name given.')
        raise Http404()

    fields = ['id', 'rscurrent'] if marker.startswith("rs") else ['name']
    # up to 15 hits are retrieved for each document type
    sub_agg = Agg('top_hits', 'top_hits', {"size": 15})
    aggs = Aggs(Agg("types", "terms", {"field": "_type"}, sub_agg=sub_agg))
    query = ElasticQuery(Query.query_string(marker, fields=fields))
    elastic = Search(search_query=query, idx=ElasticSettings.idx('MARKER'), aggs=aggs, size=0)
    res = elastic.search()

    if res.hits_total >= 1:
        types = getattr(res.aggs['types'], 'buckets')
        marker_doc = None
        ic_docs = []
        history_docs = []
        for doc_type in types:
            type_key = doc_type['key']
            for hit in doc_type['top_hits']['hits']['hits']:
                doc = Document(hit)
                if type_key == 'marker':
                    marker_doc = doc
                elif type_key == 'immunochip':
                    ic_docs.append(doc)
                elif type_key == 'rs_merge':
                    history_docs.append(doc)

        criteria = {}
        if marker_doc is not None:
            if ElasticSettings.idx('CRITERIA') is not None:
                criteria = views.get_criteria([marker_doc], 'marker', 'id', 'MARKER')
            marker_doc.marker_build = _get_marker_build(ElasticSettings.idx('MARKER'))

        context = {
            'marker': marker_doc,
            'old_dbsnp_docs': _get_old_dbsnps(marker),
            'ic': ic_docs,
            'history': history_docs,
            'criteria': criteria
        }
        return render(request, 'marker/marker.html', context, content_type='text/html')
    elif res.hits_total == 0:
        messages.error(request, 'Marker '+marker+' not found.')
        raise Http404()
def get_diseases(self):
    ''' Overridden get diseases for feature.

    Returns the codes of the disease tags found for this region in the
    REGION_CRITERIA index, or an empty list when the parent implementation
    returns a false value. '''
    if not super(RegionDocument, self).get_diseases():
        return []
    idx = ElasticSettings.idx('REGION_CRITERIA')
    disease_tags = Criteria.get_disease_tags(getattr(self, "region_id"), idx=idx)
    return [getattr(tag, "code") for tag in disease_tags]
def test_pubs_disease_tags(self):
    ''' Check the number of disease publications against the number of
    tags.disease and report differences. '''
    counts_match = True
    report = ''
    for disease in DiseasePublicationTest.DISEASES:
        pmids = self._get_pmids(disease)
        disease_code = disease.lower()
        disease_filter = Filter(Query.term('tags.disease', disease_code))
        elastic = Search(search_query=ElasticQuery(BoolQuery(b_filter=disease_filter), sources=['pmid']),
                         idx=ElasticSettings.idx('PUBLICATION'), size=len(pmids)*2)
        res = elastic.get_count()
        report += disease_code+'\tINDEX: '+str(res['count'])+'\tNCBI: '+str(len(pmids))
        if res['count'] != len(pmids):
            counts_match = False
            # list the PMIDs found on only one of the two sides
            indexed_pmids = [getattr(doc, 'pmid') for doc in elastic.search().docs]
            extra = [pmid for pmid in indexed_pmids if pmid not in pmids]
            missing = [pmid for pmid in pmids if pmid not in indexed_pmids]
            if len(extra) > 0:
                report += '\textra PMIDs: '+str(extra)
            if len(missing) > 0:
                report += '\tmissing PMIDs: '+str(missing)
        report += '\n'
    print(report)
    self.assertTrue(counts_match, 'Count for disease tags')
def get_diseases(self):
    ''' Overridden get diseases for feature.

    Returns the codes of the disease tags found for this study (looked up by
    get_name()) in the STUDY_CRITERIA index, or an empty list when the parent
    implementation returns a false value. '''
    if not super(StudyDocument, self).get_diseases():
        return []
    disease_tags = Criteria.get_disease_tags(self.get_name(), idx=ElasticSettings.idx('STUDY_CRITERIA'))
    return [getattr(tag, "code") for tag in disease_tags]
def get_gene_docs_by_ensembl_id(cls, ens_ids, sources=None):
    ''' Get the gene documents for the corresponding array of ensembl IDs.
    A dictionary is returned with the key being the document ID of each hit
    and the value the gene document. '''
    gene_idx = ElasticSettings.idx('GENE', idx_type='GENE')
    search = Search(ElasticQuery(Query.ids(ens_ids), sources=sources), idx=gene_idx, size=len(ens_ids))
    docs_by_id = {}
    for gene_doc in search.search().docs:
        docs_by_id[gene_doc.doc_id()] = gene_doc
    return docs_by_id
def get_hits_by_study_id(cls, study_id, sources=None):
    ''' Get visible/authenticated hits.

    Searches the REGION/STUDY_HITS index for up to 1000 documents with the
    given dil_study_id, filtered with a missing terms filter on group_name.

    On each returned document:
    - 'genes' (when not empty) is replaced by a dictionary of ensembl ID to
      gene symbol, falling back to the ensembl ID itself when no gene
      document was found for it;
    - 'loc' and 'encoded_loc' are set from the build_info entry matching
      settings.DEFAULT_BUILD.

    @type  sources: list
    @keyword sources: source fields to retrieve (default: empty list).
    '''
    if sources is None:
        sources = []
    hits_query = ElasticQuery(BoolQuery(must_arr=Query.term('dil_study_id', study_id),
                                        b_filter=Filter(Query.missing_terms("field", "group_name"))),
                              sources=sources)
    docs = Search(hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'), size=1000).search().docs

    # fetch the symbols of all genes on all hits with a single query
    ens_ids = [gene for doc in docs if getattr(doc, 'genes') for gene in getattr(doc, 'genes')]
    gene_docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])

    for doc in docs:
        if getattr(doc, 'genes'):
            genes = {}
            for ens_id in getattr(doc, 'genes'):
                try:
                    genes[ens_id] = getattr(gene_docs[ens_id], 'symbol')
                except KeyError:
                    # unknown gene: keep the ID, without dropping symbols already resolved
                    genes[ens_id] = ens_id
            setattr(doc, 'genes', genes)

        build_info = getattr(doc, 'build_info')
        for bi in build_info:
            if bi['build'] == settings.DEFAULT_BUILD:
                # locale.format() was removed in Python 3.12; format_string() is the replacement
                setattr(doc, "loc", "chr" + bi['seqid'] + ":" +
                        str(locale.format_string("%d", bi['start'], grouping=True)) + "-" +
                        str(locale.format_string("%d", bi['end'], grouping=True)))
                setattr(doc, "encoded_loc", "chr" + bi['seqid'] + "%3A" +
                        str(bi['start']) + ".." + str(bi['end']))
    return docs
def test_missing_terms_filtered_query(self):
    ''' Test filtered query with a missing terms filter. '''
    missing_group = TermsFilter.get_missing_terms_filter("field", "group_name")
    filtered = ElasticQuery.filtered(Query.match_all(), missing_group)
    result = Search(filtered, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(len(result.docs) == 3, "Elastic string query retrieved all public docs")
def test_string_query(self):
    ''' Test building and running a string query, and that an unknown
    keyword argument raises a QueryError. '''
    marker_query = ElasticQuery.query_string("rs2476601", fields=["id"])
    result = Search(marker_query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(len(result.docs) == 1, "Elastic string query retrieved marker (rs2476601)")
    self.assertRaises(QueryError, ElasticQuery.query_string, "rs2476601", fieldssss=["id"])
def gene_in_region(cls, hit, section=None, config=None, result_container=None):
    ''' Find the genes overlapping the (padded) region of a region hit and
    add them to the result container.

    The hit is converted to a Document and padded with
    utils.Region.pad_region_doc(). Genes in the GENE index overlapping the
    region's build_info range are collected by document ID and passed, with
    the region's ID, name and disease tags, to cls.populate_container().

    @type  result_container: dict
    @keyword result_container: container to populate (default: a new empty
             dictionary for each call).
    @return: the value returned by cls.populate_container(), or the
             result_container as passed in if the region could not be padded.
    '''
    if result_container is None:
        # a fresh container per call; a shared {} default would leak results between calls
        result_container = {}

    try:
        padded_region_doc = utils.Region.pad_region_doc(Document(hit))
    except Exception:
        # best effort: a region that cannot be padded is skipped
        logger.warning('Region padding error ')
        return result_container

    # 'build_info': {'end': 22411939, 'seqid': '1', 'build': 38, 'start': 22326008}, 'region_id': '1p36.12_008'}
    region_id = getattr(padded_region_doc, "region_id")
    region_name = getattr(padded_region_doc, "region_name")
    build_info = getattr(padded_region_doc, "build_info")
    diseases = getattr(padded_region_doc, "tags")['disease']
    seqid = build_info['seqid']
    start = build_info['start']
    end = build_info['end']

    gene_index = ElasticSettings.idx('GENE', idx_type='GENE')
    elastic = Search.range_overlap_query(seqid=seqid, start_range=start, end_range=end, idx=gene_index,
                                         field_list=['start', 'stop', '_id'],
                                         seqid_param="chromosome", end_param="stop", size=10000)
    genes = {doc.doc_id() for doc in elastic.search().docs}

    return cls.populate_container(region_id, region_name, fnotes=None, features=genes,
                                  diseases=diseases, result_container=result_container)
def _categories(idx):
    ''' Build a dictionary describing the search type of each index in the
    comma separated idx string. Each value has a 'type' (Marker, Region,
    Gene or Other) and, for markers and genes, the available 'categories'. '''
    idx_types = {}
    for idx_name in idx.split(","):
        if idx_name+'/marker' == ElasticSettings.idx('MARKER', 'MARKER'):
            idx_types[idx_name] = {'type': 'Marker',
                                   'categories': ['synonymous', 'non-synonymous'],
                                   'search': ['in LD of selected']}
        elif idx_name == ElasticSettings.idx('REGION'):
            idx_types[idx_name] = {'type': 'Region'}
        elif idx_name == ElasticSettings.idx('GENE'):
            idx_types[idx_name] = {'type': 'Gene',
                                   'categories': ['protein coding', 'non-coding', 'pseudogene']}
        else:
            idx_types[idx_name] = {'type': 'Other'}
    return idx_types
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request just the documents required from Rserve.

    The 'marker' (default 'rs2476601') and 'dataset' (default 'EUR', with
    any '-' removed) request parameters are used when listed in the view's
    filter_fields. The marker's seqid is looked up in the MARKER index and
    passed with the dataset and marker to the R function get_pop, whose
    result is parsed as JSON.

    Returns a single element list holding an ElasticObject with the marker
    and its populations. The populations are None if a TypeError,
    ValueError, IndexError or ConnectionError occurred.
    '''
    # Parsed outside the try block so that the marker is always defined
    # when the error response is built.
    filterable = getattr(view, 'filter_fields', [])
    filters = {k: v for k, v in request.GET.items() if k in filterable}
    mid1 = filters.get('marker', 'rs2476601')

    try:
        dataset = filters.get('dataset', 'EUR').replace('-', '')
        query = ElasticQuery(BoolQuery(must_arr=[Query.term("id", mid1)]), sources=['seqid', 'start'])
        elastic = Search(search_query=query, idx=ElasticSettings.idx('MARKER', 'MARKER'), size=1)
        doc = elastic.search().docs[0]
        seqid = getattr(doc, 'seqid')

        rserve = getattr(settings, 'RSERVE')
        conn = pyRserve.connect(host=rserve.get('HOST'), port=rserve.get('PORT'))
        try:
            pop_str = conn.r.get_pop(dataset, seqid, mid1)
            pops = json.loads(str(pop_str))
            populations = []
            for pop in pops:
                # add the population name to its own record
                pops[pop]['population'] = pop
                populations.append(pops[pop])
        finally:
            # always release the connection, also when the call or parsing fails
            conn.close()
        return [ElasticObject(initial={'populations': populations, 'marker': mid1})]
    except (TypeError, ValueError, IndexError, ConnectionError):
        return [ElasticObject(initial={'populations': None, 'marker': mid1})]
def _get_pub_docs_by_pmid(pmids, sources=None):
    """ Get the publication documents for the given array of PMIDs. A
    dictionary is returned with the key being the document ID of each hit
    and the value the publication document. """
    search = Search(ElasticQuery(Query.ids(pmids), sources=sources),
                    idx=ElasticSettings.idx("PUBLICATION"), size=len(pmids))
    docs_by_id = {}
    for pub_doc in search.search().docs:
        docs_by_id[pub_doc.doc_id()] = pub_doc
    return docs_by_id
def test_significant_terms(self):
    ''' Significant Terms Aggregation '''
    sig_terms = Agg("test_significant_terms", "significant_terms", {"field": "start"})
    result = Search(aggs=Aggs(sig_terms), idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue('test_significant_terms' in result.aggs, "returned aggregations")
def _get_random_marker(self):
    ''' Get the id of a random marker from the dbSNP elastic index. The
    marker is taken from a randomly chosen seqid between 1 and 10 and has a
    tags.weight of at least 80. '''
    marker_idx = ElasticSettings.idx('MARKER', 'MARKER')
    (idx, idx_type) = marker_idx.split('/')
    seqid = random.randint(1, 10)
    must = [Query.term("seqid", seqid), RangeQuery("tags.weight", gte=80)]
    random_docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=BoolQuery(must_arr=must),
                                            sources=['id', 'start'], size=1)
    return getattr(random_docs[0], 'id')
def test_get_criteria_details(self):
    ''' Test the criteria details of a random marker criteria document:
    the first hit must be the queried feature, come from the
    MARKER_CRITERIA index and one of the available criteria types, and
    carry feature details (fid, fname) for its first disease tag. '''
    config = IniParser().read_ini(MY_INI_FILE)
    idx = ElasticSettings.idx('MARKER_CRITERIA')
    available_criterias = MarkerCriteria.get_available_criterias(config=config)['marker']
    idx_type = ','.join(available_criterias)

    doc_by_idx_type = ElasticUtils.get_rdm_docs(idx, idx_type, size=1)
    self.assertEqual(len(doc_by_idx_type), 1)
    feature_id = getattr(doc_by_idx_type[0], 'qid')

    criteria_details = MarkerCriteria.get_criteria_details(feature_id, config=config)
    hits = criteria_details['hits']
    first_hit = hits[0]
    _type = first_hit['_type']
    _index = first_hit['_index']
    _id = first_hit['_id']
    _source = first_hit['_source']
    disease_tag = _source['disease_tags'][0]

    # assertTrue(a, b) only checks that a is true (b is the message), so
    # the values are compared with assertEqual instead.
    self.assertEqual(feature_id, _id)
    self.assertIn(_type, idx_type)
    # NOTE(review): assumes ElasticSettings.idx() gives the concrete index
    # name reported in the hit rather than an alias - confirm.
    self.assertEqual(idx, _index)
    self.assertIn(disease_tag, list(_source.keys()))

    fdetails = _source[disease_tag][0]
    self.assertIn('fid', fdetails.keys())
    self.assertIn('fname', fdetails.keys())
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section given a chado gene feature.

    Genes are searched in the GENE index by symbol if gene_symbol is given,
    otherwise by position on the chromosome (seqid with any 'chr' removed).
    Returns a dictionary with the matching documents under 'es_genes'. '''
    seqid = str(seqid).replace('chr', '')
    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("symbol", gene_symbol)
    else:
        if end_pos is None:
            # single position (e.g. a snp): genes that span start_pos
            ranges = [RangeQuery("start", lte=start_pos), RangeQuery("stop", gte=start_pos)]
        else:
            # genes that lie within start_pos..end_pos
            ranges = [RangeQuery("start", gte=start_pos), RangeQuery("stop", lte=end_pos)]
        query = ElasticQuery.bool(BoolQuery(must_arr=[Query.match("chromosome", seqid)] + ranges))
    return {'es_genes': Search(query, idx=ElasticSettings.idx(name='GENE')).search().docs}
def test_gene_interactions(self):
    ''' Check the number of gene interaction documents and the bioplex and
    intact interaction data, for a given parent gene and for a random one. '''
    # elastic doc example:
    # "_source":{"interaction_source": "intact", "interactors": [
    # {"interactor": "ENSG00000206053", "pubmed": "16169070"},
    # {"interactor": "ENSG00000101474", "pubmed": "16169070"},
    # {"interactor": "ENSG00000065361", "pubmed": "16169070"},
    # {"interactor": "ENSG00000085465", "pubmed": "16169070"}]}
    idx_key = 'GENE'
    idx_type_key = 'INTERACTIONS'
    idx = ElasticSettings.idx(idx_key, idx_type_key)
    (idx, idx_type) = idx.split('/')

    # Test doc count
    doc_count = DataIntegrityUtils.get_docs_count(idx, idx_type)
    self.assertGreater(doc_count, 23000, 'Gene interaction doc count greater than 23000')

    # Get interaction doc - passing the interaction source and id . Also test with random id
    (child_doc_bioplex, parent_doc_bioplex) = self.get_interaction_doc("bioplex", "ENSG00000241186")
    self.check_bioplex_data(child_doc_bioplex, parent_doc_bioplex)
    (child_doc_bioplex, parent_doc_bioplex) = self.get_interaction_doc("bioplex")
    self.check_bioplex_data(child_doc_bioplex, parent_doc_bioplex)

    (child_doc_intact, parent_doc_intact) = self.get_interaction_doc("intact", parent_id="ENSG00000090776")
    self.check_intact_data(child_doc_intact, parent_doc_intact)
    (child_doc_intact, parent_doc_intact) = self.get_interaction_doc("intact")
    self.check_intact_data(child_doc_intact, parent_doc_intact)
def test_filter(self):
    ''' Filter Aggregation, together with the avg, min, sum, stats,
    value_count and extended_stats aggregations on the start field. '''
    start_field = {"field": 'start'}
    aggs = Aggs([
        Agg('test_filter', 'filter', RangeQuery('start', gt='25000')),
        Agg('avg_start', 'avg', start_field),
        Agg('min_start', 'min', start_field),
        Agg('sum_start', 'sum', start_field),
        Agg('stats_start', 'stats', start_field),
        Agg('count_start', 'value_count', start_field),
        Agg('ext_stats_start', 'extended_stats', start_field),
    ])
    r_aggs = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT')).search().aggs

    self.assertTrue('avg_start' in r_aggs, "returned avg aggregation")
    self.assertTrue('min_start' in r_aggs, "returned min aggregation")

    stats_keys = ["min", "max", "sum", "count", "avg"]
    self.assertTrue(all(hasattr(r_aggs['stats_start'], k) for k in stats_keys),
                    "returned min aggregation")

    # extended stats provide the plain stats plus the following
    ext_stats_keys = stats_keys + ["sum_of_squares", "variance", "std_deviation", "std_deviation_bounds"]
    self.assertTrue(all(hasattr(r_aggs['ext_stats_start'], k) for k in ext_stats_keys),
                    "returned min aggregation")
def get_interaction_doc(self, interaction_source='intact', parent_id=None):
    ''' Get a random interaction document for the interaction source (and
    parent if parent_id is given) with its parent gene document. Returns
    the tuple (interaction document, parent document). '''
    idx_key = 'GENE'
    idx_type_key = 'INTERACTIONS'
    parent_idx_key = 'GENE'
    (idx, idx_type) = ElasticSettings.idx(idx_key, idx_type_key).split('/')

    source_term = Query.term("interaction_source", interaction_source)
    if parent_id:
        qbool_intact = BoolQuery().must([source_term, Query.term("_parent", parent_id)])
    else:
        qbool_intact = BoolQuery().should([source_term])

    # Get random doc or specific if id is passed in query
    doc = DataIntegrityUtils.get_rdm_docs(idx, idx_type, qbool=qbool_intact, sources=[], size=1)[0]

    # Get parent doc
    parent_id = doc.parent()
    parent_docs = DataIntegrityUtils.fetch_from_elastic(idx_key, parent_idx_key, [parent_id])
    if not parent_docs:
        # NOTE(review): the retry always uses "intact" and the parent id that
        # was just not found, so it looks like it can recurse without end -
        # confirm the intended behaviour.
        return self.get_interaction_doc("intact", parent_id)

    self.assertTrue(len(parent_docs) >= 1, "Found 1 parent")
    return doc, parent_docs[0]
def suggester(request):
    ''' Provide auto suggestions. Ajax request returning a JSON response.

    Up to 8 suggestions for the "term" request parameter are requested from
    the suggester indices of the "idx" request parameter. The response
    "data" lists the text of each option of the first suggestion entry. '''
    params = request.GET
    idx_dict = ElasticSettings.search_props(params.get("idx"), request.user)
    suggester_idx = ','.join(ElasticSettings.idx(key) for key in idx_dict['suggester_keys'])
    suggestions = Suggest.suggest(params.get("term"), suggester_idx, name='suggest', size=8)['suggest']
    texts = [option['text'] for option in suggestions[0]['options']]
    return JsonResponse({"data": texts})
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section given a chado gene feature.

    Genes are searched in the GENE index by gene_symbol if given, otherwise
    by position on the seqid ('chr' is prepended unless seqid is a string
    already starting with it). Returns a dictionary with the matching
    documents under 'es_genes'. '''
    if not (isinstance(seqid, str) and seqid.startswith("chr")):
        seqid = 'chr' + str(seqid)

    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("gene_symbol", gene_symbol)
    else:
        if end_pos is None:
            # single position (e.g. a snp): features that span start_pos
            ranges = [RangeQuery("featureloc.start", lte=start_pos),
                      RangeQuery("featureloc.end", gte=start_pos)]
        else:
            # features that lie within start_pos..end_pos
            ranges = [RangeQuery("featureloc.start", gte=start_pos),
                      RangeQuery("featureloc.end", lte=end_pos)]
        query = ElasticQuery.bool(BoolQuery(must_arr=[Query.match("seqid", seqid)] + ranges))
    return {'es_genes': Search(query, idx=ElasticSettings.idx(name='GENE')).search().docs}
def test_gene_criteria_types(self):
    """Test if the indexes have records"""
    idx_key = "GENE_CRITERIA"
    idx = ElasticSettings.idx(idx_key)
    idx_types = CriteriaDataIntegrityUtils.get_criteria_index_types(idx_key)
    gene_criterias = Criteria.get_available_criterias("gene")

    CriteriaDataIntegrityTestUtils().test_criteria_types(idx, idx_types, gene_criterias["gene"])
    CriteriaDataIntegrityTestUtils().test_criteria_mappings(idx, idx_types)

    # get random doc for each type ['gene_in_region', 'cand_gene_in_region', 'cand_gene_in_study', 'is_gene_in_mhc']
    random_docs = ElasticUtils.get_rdm_docs(idx, "gene_in_region", size=1)
    self.assertTrue(len(random_docs) == 1, "got back one document")
    gene_in_region_doc = random_docs[0]

    # example document:
    # {'score': 10, 'CRO': [{'fname': '4p11', 'fid': '4p11_005'}],
    # '_meta': {'_type': 'gene_in_region', '_score': 0.9997835,
    # '_index': 'pydgin_imb_criteria_gene', '_id': 'ENSG00000250753'},
    # 'disease_tags': ['CRO'], 'qid': 'ENSG00000250753'}
    print(getattr(gene_in_region_doc, "qid"))

    # example qid, disease tags and feature notes:
    # ENSG00000248482
    # ['IBD', 'UC']
    # [{'fid': '5q31.1_013', 'fname': '5q31.1'}]
    # [{'fid': '5q31.1_013', 'fname': '5q31.1'}]
    disease_tags = getattr(gene_in_region_doc, "disease_tags")
    fnotes = getattr(gene_in_region_doc, disease_tags[0])
    print(fnotes[0]["fid"])
def association_stats(request, sources=None):
    ''' Get association statistics for a chromosome, optionally limited to
    the positions between the start and end request parameters.

    The IC_STATS index type is given by the idx_type request parameter. The
    JSON response lists the CHROM, POS, PVALUE and DBSNP_ID of each hit
    under "variants". '''
    params = request.GET
    seqid = params.get('chr').replace('chr', '')
    idx_type = params.get('idx_type').upper()
    start = params.get('start')
    end = params.get('end')
    data = []

    def get_stats(resp_json):
        # callback given to scan_and_scroll: collect the statistics of each hit
        for hit in resp_json['hits']['hits']:
            d = Document(hit)
            data.append({
                "CHROM": getattr(d, 'seqid'),
                "POS": getattr(d, 'position'),
                "PVALUE": getattr(d, 'p_value'),
                "DBSNP_ID": getattr(d, 'marker')
            })

    if start is not None and end is not None:
        in_range = BoolQuery(must_arr=[Query.query_string(seqid, fields=["seqid"]),
                                       RangeQuery("position", gte=start, lte=end)])
        query = ElasticQuery(in_range, sources=sources)
    else:
        query = ElasticQuery(Query.query_string(seqid, fields=["seqid"]), sources=sources)

    ScanAndScroll.scan_and_scroll(ElasticSettings.idx('IC_STATS', idx_type), call_fun=get_stats, query=query)
    return JsonResponse({"variants": data})
def test_missing(self):
    ''' Missing Aggregation '''
    missing_seqid = Agg("test_missing", "missing", {"field": "seqid"})
    result = Search(aggs=Aggs(missing_seqid), idx=ElasticSettings.idx('DEFAULT')).search()
    doc_count = getattr(result.aggs['test_missing'], 'doc_count')
    self.assertTrue(doc_count == 0, "no missing seqid fields")
def test_top_hits(self):
    ''' Top Hits Aggregation '''
    aggs = Aggs([
        Agg('test_filter', 'filter', RangeQuery('start', gt='2000')),
        Agg('test_top_hits', 'top_hits', {"size": 1}),
    ])
    result = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT')).search()
    top_hits = result.aggs['test_top_hits'].get_hits()
    self.assertTrue(len(top_hits) == 1, "returned the top hit")
def get_publications(cls, pmids, sources=None):
    ''' Get publications from the list of PMIDs.

    @type  pmids: list
    @param pmids: publication IDs to retrieve.
    @type  sources: list
    @keyword sources: source fields to retrieve (default: empty list).
    @return: the publication documents found, or None if pmids is None or empty.
    '''
    if not pmids:
        return None
    if sources is None:
        sources = []
    from elastic.search import Search, ElasticQuery
    # request as many documents as there are IDs so that none are cut off
    pubs = Search(ElasticQuery(Query.ids(pmids), sources=sources),
                  idx=ElasticSettings.idx('PUBLICATION', 'PUBLICATION'),
                  size=len(pmids)).search().docs
    return pubs
def get_marker(cls, request, marker, context):
    ''' Add the documents found for the marker to the context.

    Names starting with "rs" are searched in the 'id' and 'rscurrent'
    fields, all others in the 'name' field of the MARKER index. Hits are
    grouped by document type into the marker document, immunochip documents
    and rs merge history documents. Returns the updated context.

    Raises Http404 when no marker name is given or no document matches.
    '''
    if marker is None:
        messages.error(request, 'No marker name given.')
        raise Http404()

    fields = ['id', 'rscurrent'] if marker.startswith("rs") else ['name']
    # up to 15 hits are retrieved for each document type
    top_hits = Agg('top_hits', 'top_hits', {"size": 15})
    aggs = Aggs(Agg("types", "terms", {"field": "_type"}, sub_agg=top_hits))
    query = ElasticQuery(Query.query_string(marker, fields=fields))
    res = Search(search_query=query, idx=ElasticSettings.idx('MARKER'), aggs=aggs, size=0).search()

    title = ''
    if res.hits_total >= 1:
        marker_doc = None
        ic_docs = []
        history_docs = []
        for bucket in getattr(res.aggs['types'], 'buckets'):
            type_key = bucket['key']
            for hit in bucket['top_hits']['hits']['hits']:
                doc = PydginDocument.factory(hit)
                if doc.get_name() is not None:
                    title = doc.get_name()

                if type_key == 'marker':
                    marker_doc = doc
                elif type_key == 'immunochip':
                    ic_docs.append(doc)
                elif type_key == 'rs_merge':
                    history_docs.append(doc)

        if marker_doc is not None:
            marker_doc.marker_build = _get_marker_build(ElasticSettings.idx('MARKER'))

        context['criteria'] = MarkerView.criteria_disease_tags(request, [marker])
        context['features'] = [marker_doc]
        context['old_dbsnp_docs'] = _get_old_dbsnps(marker)
        context['ic'] = ic_docs
        context['history'] = history_docs
        context['title'] = title
        context['jbrowse_tracks'] = "PydginRegions%2Cdbsnp146%2CEnsemblGenes"
        return context
    elif res.hits_total == 0:
        messages.error(request, 'Marker '+marker+' not found.')
        raise Http404()
def _build_score_functions(idx_dict):
    ''' Build an array of ScoreFunction instances for boosting query results. '''
    searched_idx = idx_dict['idx']

    # create function score query to return documents with greater weights.
    score_fns = [ScoreFunction.create_score_function('field_value_factor', field='tags.weight', missing=1.0)]

    # create a function score that increases the score of markers.
    if ElasticSettings.idx('MARKER') is not None and ElasticSettings.idx('MARKER') in searched_idx:
        marker_type = ElasticSettings.get_idx_types('MARKER')['MARKER']['type']
        type_filter = Filter(Query({"type": {"value": marker_type}}))
        score_fns.append(ScoreFunction.create_score_function('weight', 2, function_filter=type_filter.filter))
        logger.debug("Add marker type score function.")

    # create a function score that increases the score of publications tagged with disease.
    if ElasticSettings.idx('PUBLICATION') is not None and ElasticSettings.idx('PUBLICATION') in searched_idx:
        disease_filter = ExistsFilter('tags.disease').filter
        score_fns.append(ScoreFunction.create_score_function('weight', 2, function_filter=disease_filter))
        logger.debug("Add publication disease tag score function.")
    return score_fns
def test_hit_attributes(self):
    ''' Fetch a random document from each of the region index types. '''
    for idx_type_key in RegionDataTest.IDX_TYPE_KEYS:
        region_idx = ElasticSettings.idx(RegionDataTest.IDX_KEY, idx_type_key)
        (idx, idx_type) = region_idx.split('/')
        # NOTE(review): the fetched documents are not checked any further
        docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(), sources=[], size=1)
def check_hits(resp_json):
    ''' Compare the position of each document in the response with the
    position of the marker with the same id in the MARKER index and log an
    error for SNVs that differ by more than one base. '''
    # documents in the response keyed by their rs id
    rsids = {}
    for hit in resp_json['hits']['hits']:
        hit_doc = Document(hit)
        hit_rsid = getattr(hit_doc, "id")
        if hit_rsid is not None:
            rsids[hit_rsid] = hit_doc
    rsids_keys = list(rsids.keys())

    terms_filter = TermsFilter.get_terms_filter("id", rsids_keys)
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'), size=len(rsids_keys))

    for doc in elastic.search().docs:
        if 'VC=SNV' not in getattr(doc, "info"):
            continue
        rsid = getattr(doc, "id")
        ic_doc = rsids[rsid]
        pos1 = getattr(doc, "start")
        pos2 = self._get_highest_build(ic_doc)['position']
        if abs(int(pos1) - int(pos2)) <= 1:
            continue

        is_par = getattr(ic_doc, 'is_par')
        allele_a = getattr(ic_doc, 'allele_a')
        if is_par is not None or allele_a == 'D' or allele_a == 'I':
            continue

        msg = ("CHECK IC/DBSNP POSITIONS:: "+getattr(ic_doc, 'name') +
               ' '+str(pos2)+" "+rsid+' '+str(pos1))
        # ' ('+ic_doc.doc_id()+' '+json.dumps(getattr(ic_doc, 'build_info'))+')'

        # add the markers found at the position given in the response document
        query = ElasticQuery.filtered(Query.term("seqid", getattr(doc, 'seqid')),
                                      Filter(Query.term("start", pos2)))
        docs_by_pos = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER')).search().docs
        if len(docs_by_pos) > 0:
            for d in docs_by_pos:
                msg += " ("+getattr(d, "id")+":"+str(getattr(d, "start"))+")"

        # add the history records that have this rs id as rslow
        query = ElasticQuery.filtered(Query.match_all(), Filter(Query.term("rslow", rsid)))
        history_docs = Search(query, idx=ElasticSettings.idx('MARKER', 'HISTORY')).search().docs
        if len(history_docs) > 0:
            for d in history_docs:
                msg += " (rshigh:"+str(getattr(d, "rshigh")) + \
                       " build_id:"+str(getattr(d, "build_id"))+")"
        logger.error(msg)
def test_bool_filtered_query(self):
    ''' Test building and running a filtered boolean query. '''
    query_bool = BoolQuery(must_not_arr=[Query.term("seqid", 2)],
                           should_arr=[RangeQuery("start", gte=10050)])
    query_bool.must([Query.term("id", "rs768019142")]) \
              .should(RangeQuery("start", gte=10054))
    query = ElasticQuery.filtered_bool(Query.match_all(), query_bool, sources=["id", "seqid"])
    elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    # assertEqual (rather than assertTrue on a comparison) reports the
    # actual number of hits when the test fails
    self.assertEqual(elastic.search().hits_total, 1,
                     "Elastic filtered query retrieved marker (rs768019142)")
def range_overlap_query(cls, seqid, start_range, end_range, search_from=0, size=20,
                        idx=ElasticSettings.idx('DEFAULT'), field_list=None,
                        seqid_param="seqid", start_param="start", end_param="end"):
    ''' Build a search from a range overlap query.

    The query is created by ElasticUtils.range_overlap_query from the
    sequence id, the range and the field/parameter names; it is then passed
    to cls together with the paging options and the index.

    @return: the object returned by cls(search_query=..., search_from=...,
             size=..., idx=...).
    '''
    # imported here rather than at module level
    from elastic import utils
    name_options = dict(field_list=field_list, seqid_param=seqid_param,
                        start_param=start_param, end_param=end_param)
    overlap_query = utils.ElasticUtils.range_overlap_query(seqid, start_range, end_range,
                                                           **name_options)
    return cls(search_query=overlap_query, search_from=search_from, size=size, idx=idx)
def test_and_filtered_query(self):
    ''' Test building and running a filtered query that uses an AndFilter. '''
    query_bool = BoolQuery(must_arr=[RangeQuery("start", gte=1)])
    and_filter = AndFilter(query_bool)
    and_filter.extend(RangeQuery("start", gte=1)) \
              .extend(Query.term("seqid", 1))
    query = ElasticQuery.filtered(Query.term("seqid", 1), and_filter)
    elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    # assertGreaterEqual reports the actual number of hits on failure
    self.assertGreaterEqual(elastic.search().hits_total, 1,
                            "Elastic filtered query retrieved marker(s)")
def test_url_rotate(self):
    ''' Test the url rotates from http://xxx:9200 to correct url. '''
    query = ElasticQuery.filtered(Query.term("seqid", 1),
                                  Filter(Query.term("id", "rs768019142")))
    elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    try:
        self.assertEqual(elastic.search().hits_total, 1,
                         "Elastic filtered query retrieved marker")
        Search.index_exists('test', 'test2')
    finally:
        # always reset the URL index, even when the assertion above fails,
        # so that the changed value does not leak into other tests
        ElasticUrl.URL_INDEX = 0
def test_fetch_overlapping_features(self):
    ''' Test that Criteria.fetch_overlapping_features returns documents from
    the REGION/STUDY_HITS index for seqid 1, positions 206767602-206772494. '''
    region_index = ElasticSettings.idx('REGION', idx_type='STUDY_HITS')
    (region_idx, region_idx_type) = region_index.split('/')
    seqid = '1'
    start = 206767602
    stop = 206772494
    # NOTE(review): the first argument ('38') is presumably the genome build - confirm
    result_docs = Criteria.fetch_overlapping_features('38', seqid, start, stop,
                                                      region_idx, region_idx_type)
    # assertGreater reports the actual number of documents on failure
    self.assertGreater(len(result_docs), 0, 'Got some overlapping features')
def region_page(request, region):
    ''' Render the region page.

    Runs a match query for the region on the "attr.region_id" field of the
    REGION index and renders the 'region/region.html' template with the
    search result (plus a page title) as the context.

    @param request: the request passed on to render.
    @param region: value matched against "attr.region_id".
    @return: the response produced by render.
    '''
    query = ElasticQuery.query_match("attr.region_id", region)
    elastic = Search(query, idx=ElasticSettings.idx(name='REGION'))
    context = elastic.get_result()
    context['title'] = "Region"
    # the debugging print of the whole context on every request was removed
    return render(request, 'region/region.html', context, content_type='text/html')
def test_terms_avg_order(self):
    ''' Test a terms aggregation ordered by an average sub-aggregation. '''
    agg_name = "test"
    sub_agg = Agg('avg_start', 'avg', {"field": "start"})
    agg = Agg(agg_name, "terms",
              {"field": "seqid", "size": 0, "order": {"avg_start": "desc"}},
              sub_agg=sub_agg)
    search = Search(aggs=Aggs(agg), idx=ElasticSettings.idx('DEFAULT'))
    r_aggs = search.search().aggs
    # assertIn shows the returned aggregations on failure
    self.assertIn(agg_name, r_aggs, "returned test aggregations")
    # look the aggregation up by agg_name instead of repeating the literal
    self.assertGreater(r_aggs[agg_name].get_buckets()[0]['doc_count'], 1)
def __init__(self, search_query=None, aggs=None, search_from=0, size=20, search_type=None,
             idx=ElasticSettings.idx('DEFAULT'), idx_type='', qsort=None, elastic_url=None):
    ''' Store the search parameters and build the search URL path.

    @type  search_query: L{ElasticQuery}
    @keyword search_query: The elastic query to search (default: None).
    @type  aggs: L{Aggs}
    @keyword aggs: Aggregations; merged into the query when one was given,
    otherwise used as the query.
    @type  search_from: integer
    @keyword search_from: Offset used in paginations (default: 0).
    @type  size: integer
    @keyword size: maximum number of hits to return (default: 20).
    @type  search_type: string
    @keyword search_type: when given it is appended to the URL as the
    search_type parameter in place of the size/from parameters.
    @type  idx: string
    @keyword idx: index to search (default: default index defined in settings).
    @type  idx_type: string
    @keyword idx_type: index type (default: '').
    @type  qsort: Sort
    @keyword qsort: sorting merged into the query; an error is logged when
    there is no query to sort.
    @type  elastic_url: string
    @keyword elastic_url: Elastic URL (default: ElasticSettings.url()).
    @raise QueryError: if search_query is not an ElasticQuery or qsort is
    not a Sort.
    '''
    if search_query is not None:
        if not isinstance(search_query, ElasticQuery):
            raise QueryError("not an ElasticQuery")
        self.query = search_query.query

    if aggs is not None:
        if not hasattr(self, 'query'):
            self.query = aggs.aggs
        else:
            self.query.update(aggs.aggs)

    if qsort is not None:
        if not isinstance(qsort, Sort):
            raise QueryError("not a Sort")
        if not hasattr(self, 'query'):
            logger.error("no query to sort")
        else:
            self.query.update(qsort.qsort)

    self.elastic_url = ElasticSettings.url() if elastic_url is None else elastic_url
    self.idx = idx
    self.idx_type = idx_type
    self.search_type = search_type
    self.search_from = search_from
    self.size = size

    # the URL parameters are either paging (size/from) or the search type
    search_path = self.idx + '/' + self.idx_type + '/_search?'
    if self.search_type is None:
        url_params = 'size=' + str(self.size) + '&from=' + str(self.search_from)
    else:
        url_params = 'search_type=' + search_type
    self.url = search_path + url_params
def get_random_feature_id(self):
    ''' Return the 'qid' of one random document from the GENE_CRITERIA index.

    The index types searched are the 'gene' criteria returned by
    GeneCriteria.get_available_criterias for the ini file MY_INI_FILE,
    joined with commas.
    '''
    config = IniParser().read_ini(MY_INI_FILE)
    idx = ElasticSettings.idx('GENE_CRITERIA')
    available_criterias = GeneCriteria.get_available_criterias(config=config)['gene']
    idx_type = ','.join(available_criterias)
    doc_by_idx_type = ElasticUtils.get_rdm_docs(idx, idx_type, size=1)
    # assertGreater reports the actual number of documents on failure
    self.assertGreater(len(doc_by_idx_type), 0)
    return getattr(doc_by_idx_type[0], 'qid')
def test_terms_query(self):
    ''' Test building and running a terms query with highlighting. '''
    highlight = Highlight(["id"])
    query = ElasticQuery(Query.terms("id", ["rs2476601", "rs768019142"]), highlight=highlight)
    elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    docs = elastic.search().docs
    # the specific assertions report the offending value on failure
    self.assertEqual(len(docs), 2,
                     "Elastic string query retrieved markers (rs2476601, rs768019142)")
    self.assertTrue(getattr(docs[0], 'seqid'), "Hit attribute found")
    self.assertIsNotNone(docs[0].highlight(), "highlighting found")
def test_bool_filtered_query2(self):
    ''' Test building and running a filtered boolean query that wraps a
    query string query. '''
    query_bool = BoolQuery()
    query_bool.should(RangeQuery("start", lte=20000)) \
              .should(Query.term("seqid", 2)) \
              .must(Query.term("seqid", 1))
    query_string = Query.query_string("rs768019142", fields=["id", "seqid"])
    query = ElasticQuery.filtered_bool(query_string, query_bool, sources=["id", "seqid", "start"])
    elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    # assertEqual reports the actual number of hits on failure
    self.assertEqual(elastic.search().hits_total, 1,
                     "Elastic filtered query retrieved marker (rs768019142)")
def test_filters(self):
    ''' Filters Aggregation '''
    filters = {'filters': {'start_gt': RangeQuery('start', gt='1000'),
                           'start_lt': RangeQuery('start', lt='100000')}}
    agg = Agg('test_filters', 'filters', filters)
    aggs = Aggs(agg)
    search = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT'))
    r_aggs = search.search().aggs
    # assertIn shows the returned buckets on failure; the message now names
    # the filters aggregation that is actually being tested
    self.assertIn('start_lt', r_aggs['test_filters'].get_buckets(),
                  "returned filters aggregation")
def test_or_filtered_query(self):
    ''' Test building and running a filtered query that uses an OrFilter. '''
    highlight = Highlight(["id", "seqid"])
    query_bool = BoolQuery(must_arr=[RangeQuery("start", lte=1),
                                     RangeQuery("end", gte=100000)])
    or_filter = OrFilter(RangeQuery("start", gte=1, lte=100000))
    or_filter.extend(query_bool) \
             .extend(Query.query_string("rs*", fields=["id", "seqid"]).query_wrap())
    query = ElasticQuery.filtered(Query.term("seqid", 1), or_filter, highlight=highlight)
    elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    # assertGreaterEqual reports the actual number of hits on failure
    self.assertGreaterEqual(elastic.search().hits_total, 1,
                            "Elastic filtered query retrieved marker(s)")
def test_range(self):
    ''' Range Aggregation '''
    agg = Agg("test_range_agg", "range",
              {"field": "start",
               "ranges": [{"to": 10000},
                          {"from": 10000, "to": 15000}]})
    aggs = Aggs(agg)
    search = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT'))
    r_aggs = search.search().aggs
    # assertEqual reports the actual number of buckets on failure
    self.assertEqual(len(r_aggs['test_range_agg'].get_buckets()), 2,
                     "returned two buckets in range aggregations")
def test_bool_query(self):
    ''' Test a bool query. '''
    query_bool = BoolQuery()
    highlight = Highlight(["id", "seqid"])
    query_bool.must(Query.term("id", "rs768019142")) \
              .must(RangeQuery("start", gt=1000)) \
              .must_not(Query.match("seqid", "2")) \
              .should(Query.match("seqid", "3")) \
              .should(Query.match("seqid", "1"))
    query = ElasticQuery.bool(query_bool, highlight=highlight)
    elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    # assertEqual reports the actual number of documents on failure
    self.assertEqual(len(elastic.search().docs), 1,
                     "Elastic string query retrieved marker (rs768019142)")
def test_function_score_query(self):
    ''' Test a function score query wrapping a query string query.

    The score function is a 'field_value_factor' on the start field with the
    'reciprocal' modifier and the boost mode is 'replace'. More than one
    document must be returned and the start values must be strictly
    increasing in the order the documents come back.
    '''
    scorer = ScoreFunction.create_score_function('field_value_factor', field='start',
                                                 modifier='reciprocal')
    rs_query = Query.query_string("rs*", fields=["id", "seqid"])
    scored_query = FunctionScoreQuery(rs_query, [scorer], boost_mode='replace')
    search = Search(ElasticQuery(scored_query), idx=ElasticSettings.idx('DEFAULT'))
    docs = search.search().docs
    n_docs = len(docs)
    self.assertGreater(n_docs, 1, str(n_docs))

    previous_start = 0
    for doc in docs:
        current_start = getattr(doc, 'start')
        self.assertLess(previous_start, current_start)
        previous_start = current_start