def test_missing_terms_filtered_query(self):
    ''' Test filtered query with a missing terms filter. '''
    missing_filter = TermsFilter.get_missing_terms_filter("field", "group_name")
    search_query = ElasticQuery.filtered(Query.match_all(), missing_filter)
    result_docs = Search(search_query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    self.assertTrue(len(result_docs) == 3, "Elastic string query retrieved all public docs")
def gene_mgi_parse(cls, gene_pubs, idx):
    ''' Parse Ensembl and MGI data from JAX and bulk-update the gene index
    with MGI cross-references.

    @param gene_pubs: iterable of tab-delimited lines (MGI id in column 0,
                      mouse Ensembl gene id in column 5).
    @param idx: name of the elastic index to update.
    @raise PipelineError: if a line lacks the expected MGI/ENSMUSG ids. '''
    orthogenes_mgi = {}
    for gene_mgi in gene_pubs:
        parts = gene_mgi.split('\t')
        if 'MGI:' not in parts[0]:
            raise PipelineError('MGI not found '+parts[0])
        if 'ENSMUSG' not in parts[5]:
            raise PipelineError('ENSMUSG not found '+parts[5])
        # map mouse ensembl id -> bare MGI accession (prefix stripped)
        orthogenes_mgi[parts[5]] = parts[0].replace('MGI:', '')
    orthogene_keys = list(orthogenes_mgi.keys())
    chunk_size = 450  # keep terms filters and bulk payloads bounded
    for i in range(0, len(orthogene_keys), chunk_size):
        chunk_gene_keys = orthogene_keys[i:i+chunk_size]
        json_data = ''
        query = ElasticQuery.filtered(Query.match_all(),
                                      TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl",
                                                                   chunk_gene_keys))
        docs = Search(query, idx=idx, size=chunk_size).search().docs
        for doc in docs:
            ens_id = doc.doc_id()
            idx_type = doc.type()
            mm = getattr(doc, 'dbxrefs')['orthologs']['mmusculus']
            mm['MGI'] = orthogenes_mgi[mm['ensembl']]
            dbxrefs = {"dbxrefs": {'orthologs': {"mmusculus": mm}}}
            # bulk API: action line followed by the partial document
            doc_data = {"update": {"_id": ens_id, "_type": idx_type,
                                   "_index": idx, "_retry_on_conflict": 3}}
            json_data += json.dumps(doc_data) + '\n'
            json_data += json.dumps({'doc': dbxrefs}) + '\n'
        if json_data != '':
            Loader().bulk_load(idx, idx_type, json_data)
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request just the documents required from elastic.

    Builds an optionally-filtered match-all query limited to the current
    page (size/offset from the view's paginator) and returns the hits as
    ElasticObject instances. Sets view.es_count to the total hit count. '''
    q_size = view.paginator.get_limit(request)
    q_from = view.paginator.get_offset(request)
    filterable = getattr(view, 'filter_fields', [])
    # keep only GET parameters the view declares as filterable
    filters = dict([(k, v) for k, v in request.GET.items() if k in filterable])
    search_filters = self._build_filters(filters=filters)
    if search_filters is not None:
        q = ElasticQuery.filtered(Query.match_all(), search_filters)
    else:
        q = ElasticQuery(Query.match_all())
    s = Search(search_query=q, idx=getattr(view, 'idx'), size=q_size, search_from=q_from)
    json_results = s.get_json_response()
    results = []
    for result in json_results['hits']['hits']:
        new_obj = ElasticObject(initial=result['_source'])
        new_obj.uuid = result['_id']
        results.append(new_obj)
    # expose the total hit count for the paginator
    view.es_count = json_results['hits']['total']
    return results
def _check_gene_history(cls, gene_sets, config):
    '''find a way to handle this better

    Scan the gene history index and split the given gene ids into those
    replaced by a new id and those that were discontinued.
    @return: (dict of old id -> new id, list of discontinued ids). '''
    section = config['GENE_HISTORY']
    newgene_ids = {}
    discountinued_geneids = []

    def process_hits(resp_json):
        # scan-and-scroll callback: collect replaced/discontinued ids
        hits = resp_json['hits']['hits']
        docs = [Document(hit) for hit in hits]
        for doc in docs:
            geneid = getattr(doc, 'geneid')
            discontinued_geneid = getattr(doc, 'discontinued_geneid')
            if geneid is None:
                # no replacement id -> the gene was discontinued
                discountinued_geneids.append(str(discontinued_geneid))
            else:
                newgene_ids[str(discontinued_geneid)] = str(geneid)

    query = ElasticQuery.filtered(Query.match_all(),
                                  TermsFilter.get_terms_filter("discontinued_geneid", gene_sets),
                                  sources=['geneid', 'discontinued_geneid'])
    ScanAndScroll.scan_and_scroll(section['index'], idx_type=section['index_type'],
                                  call_fun=process_hits, query=query)
    return (newgene_ids, discountinued_geneids)
def _ensembl_entrez_lookup(cls, ensembl_gene_sets, section):
    ''' Get an ensembl:entrez id dictionary from the gene index dbxrefs. '''
    ens_filter = TermsFilter.get_terms_filter("dbxrefs.ensembl", ensembl_gene_sets)
    lookup_query = ElasticQuery.filtered(Query.match_all(), ens_filter,
                                         sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
    matched = Search(lookup_query, idx=section['index'],
                     size=len(ensembl_gene_sets)).search().docs
    mapping = {}
    for hit in matched:
        mapping[hit.doc_id()] = getattr(hit, 'dbxrefs')['entrez']
    return mapping
def check_hits(resp_json):
    # NOTE(review): closure — relies on `self` and `logger` from the
    # enclosing scope; cross-checks IC marker positions against dbSNP.
    rsids = {}
    docs = [Document(hit) for hit in resp_json['hits']['hits']]
    for doc in docs:
        rsid = getattr(doc, "id")
        if rsid is not None:
            rsids[rsid] = doc
    rsids_keys = list(rsids.keys())
    terms_filter = TermsFilter.get_terms_filter("id", rsids_keys)
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'), size=len(rsids_keys))
    docs_by_rsid = elastic.search().docs
    for doc in docs_by_rsid:
        info = getattr(doc, "info")
        if 'VC=SNV' not in info:
            continue  # only compare single-nucleotide variants
        rsid = getattr(doc, "id")
        ic_doc = rsids[rsid]
        pos1 = getattr(doc, "start")
        pos2 = self._get_highest_build(ic_doc)['position']
        if abs(int(pos1) - int(pos2)) > 1:
            # positions differ by more than one base; skip PAR markers and indels
            is_par = getattr(ic_doc, 'is_par')
            allele_a = getattr(ic_doc, 'allele_a')
            if is_par is None and not (allele_a == 'D' or allele_a == 'I'):
                msg = ("CHECK IC/DBSNP POSITIONS:: "+getattr(ic_doc, 'name') +
                       ' '+str(pos2)+" "+rsid+' '+str(pos1))
                # ' ('+ic_doc.doc_id()+' '+json.dumps(getattr(ic_doc, 'build_info'))+')'
                # list any markers recorded at the IC position on the same chromosome
                query = ElasticQuery.filtered(Query.term("seqid", getattr(doc, 'seqid')),
                                              Filter(Query.term("start", pos2)))
                elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
                docs_by_pos = elastic.search().docs
                if len(docs_by_pos) > 0:
                    for d in docs_by_pos:
                        msg += " ("+getattr(d, "id")+":"+str(getattr(d, "start"))+")"
                # check the marker history index for merged rs numbers
                query = ElasticQuery.filtered(Query.match_all(), Filter(Query.term("rslow", rsid)))
                elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'HISTORY'))
                docs_by_pos = elastic.search().docs
                if len(docs_by_pos) > 0:
                    for d in docs_by_pos:
                        msg += " (rshigh:"+str(getattr(d, "rshigh")) + \
                               " build_id:"+str(getattr(d, "build_id"))+")"
                logger.error(msg)
def test_and_filtered_query(self):
    ''' Test building and running a filtered query. '''
    bool_query = BoolQuery(must_arr=[RangeQuery("start", gte=1)])
    combined_filter = AndFilter(bool_query)
    combined_filter.extend(RangeQuery("start", gte=1)).extend(Query.term("seqid", 1))
    filtered_query = ElasticQuery.filtered(Query.term("seqid", 1), combined_filter)
    result = Search(filtered_query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total >= 1, "Elastic filtered query retrieved marker(s)")
def test_url_rotate(self):
    ''' Test the url rotates from http://xxx:9200 to correct url. '''
    query = ElasticQuery.filtered(Query.term("seqid", 1),
                                  Filter(Query.term("id", "rs768019142")))
    elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(elastic.search().hits_total == 1, "Elastic filtered query retrieved marker")
    # issue a further request that exercises the url rotation
    Search.index_exists('test', 'test2')
    ElasticUrl.URL_INDEX = 0  # reset
def _entrez_ensembl_lookup(cls, gene_sets, section, config=None):
    ''' Get an entrez:ensembl id dictionary, first replacing historic
    entrez ids with their current equivalents. '''
    (newgene_ids, discontinued_ids) = Gene._check_gene_history(gene_sets, config)
    replaced_gene_sets = Gene._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)
    entrez_filter = TermsFilter.get_terms_filter("dbxrefs.entrez", replaced_gene_sets)
    lookup_query = ElasticQuery.filtered(Query.match_all(), entrez_filter,
                                         sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
    matched = Search(lookup_query, idx=section['index'],
                     size=len(replaced_gene_sets)).search().docs
    mapping = {}
    for hit in matched:
        mapping[getattr(hit, 'dbxrefs')['entrez']] = hit.doc_id()
    return mapping
def test_or_filtered_query(self):
    ''' Test building and running a filtered query. '''
    hl = Highlight(["id", "seqid"])
    bool_q = BoolQuery(must_arr=[RangeQuery("start", lte=1),
                                 RangeQuery("end", gte=100000)])
    region_filter = OrFilter(RangeQuery("start", gte=1, lte=100000))
    region_filter.extend(bool_q)
    region_filter.extend(Query.query_string("rs*", fields=["id", "seqid"]).query_wrap())
    filtered_q = ElasticQuery.filtered(Query.term("seqid", 1), region_filter, highlight=hl)
    result = Search(filtered_q, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total >= 1, "Elastic filtered query retrieved marker(s)")
def _build_frags_query(frags_idx, chrom, segmin, segmax):
    ''' Query the fragments index for features overlapping the segment
    and return them with positions made relative to the segment. '''
    # seqid may be stored with or without a "chr" prefix
    query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                  Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                  utils.bedFields)
    fragsQuery = Search(search_query=query, search_from=0, size=2000000, idx=frags_idx)
    fragsResult = fragsQuery.get_result()
    frags = fragsResult['data']
    frags = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], frags)
    return frags
def post(self, request, *args, **kwargs):
    ''' Return study hits matching an ensembl id, a single marker or a
    marker list, with gene symbols and publication details substituted in.

    Responds with HTTP 400 if no search parameter is supplied (previously
    this path raised an unhandled NameError). '''
    ens_id = self.request.POST.get('ens_id')
    marker = self.request.POST.get('marker')
    markers = self.request.POST.getlist('markers[]')
    if ens_id:
        sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
    elif marker:
        sfilter = Filter(Query.query_string(marker, fields=["marker"]).query_wrap())
    elif markers:
        sfilter = Filter(Query.query_string(' '.join(markers), fields=["marker"]).query_wrap())
    else:
        # BUG FIX: sfilter was unbound here, so the query below raised
        # NameError; reject the request explicitly instead.
        return JsonResponse({'error': 'no ens_id, marker or markers[] supplied'}, status=400)
    query = ElasticQuery.filtered(Query.match_all(), sfilter)
    elastic = Search(query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'), size=500)
    study_hits = elastic.get_json_response()['hits']
    ens_ids = []
    pmids = []
    for hit in study_hits['hits']:
        if 'pmid' in hit['_source']:
            pmids.append(hit['_source']['pmid'])
        if 'genes' in hit['_source']:
            for ens_id in hit['_source']['genes']:
                ens_ids.append(ens_id)
    docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
    pub_docs = PublicationDocument.get_pub_docs_by_pmid(pmids, sources=['authors.name', 'journal'])
    for hit in study_hits['hits']:
        genes = {}
        if 'genes' in hit['_source']:
            # replace ensembl ids with their gene symbols where known
            for ens_id in hit['_source']['genes']:
                try:
                    genes[ens_id] = getattr(docs[ens_id], 'symbol')
                except KeyError:
                    genes = {ens_id: ens_id}
        hit['_source']['genes'] = genes
        if 'pmid' in hit['_source']:
            pmid = hit['_source']['pmid']
            try:
                authors = getattr(pub_docs[pmid], 'authors')
                journal = getattr(pub_docs[pmid], 'journal')
                # surname of the first author (last whitespace-separated token)
                hit['_source']['pmid'] = \
                    {'pmid': pmid,
                     'author': authors[0]['name'].rsplit(None, 1)[-1] if authors else "",
                     'journal': journal}
            except KeyError:
                hit['_source']['pmid'] = {'pmid': pmid}
    return JsonResponse(study_hits)
def _build_frags_query(frags_idx, chrom, segmin, segmax):
    ''' Query the fragments index for features overlapping the segment
    and return them with positions made relative to the segment. '''
    # seqid may be stored with or without a "chr" prefix
    query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                  Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                  utils.bedFields)
    fragsQuery = Search(search_query=query, search_from=0, size=10000, idx=frags_idx)
    fragsResult = fragsQuery.get_json_response()
    # IMPROVED: removed dead commented-out get_result() code; build the
    # source list with a comprehension
    frags = [hit['_source'] for hit in fragsResult['hits']['hits']]
    return utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], frags)
def _entrez_ensembl_lookup(cls, gene_sets, section, config=None):
    ''' Get an entrez:ensembl id dictionary.

    Uses scan-and-scroll so the result is not bounded by a query size;
    historic entrez ids are first replaced with their current ids. '''
    (newgene_ids, discontinued_ids) = Gene._check_gene_history(gene_sets, config)
    replaced_gene_sets = Gene._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)
    lookup = {}

    def process_hits(resp_json):
        # scan-and-scroll callback: accumulate entrez -> ensembl pairs
        hits = resp_json['hits']['hits']
        docs = [Document(hit) for hit in hits]
        lookup.update({getattr(doc, 'dbxrefs')['entrez']: doc.doc_id() for doc in docs})

    equery = ElasticQuery.filtered(Query.match_all(),
                                   TermsFilter.get_terms_filter("dbxrefs.entrez", replaced_gene_sets),
                                   sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
    ScanAndScroll.scan_and_scroll(section['index'], call_fun=process_hits, query=equery)
    return lookup
def _check_gene_history(cls, gene_sets, section):
    ''' Query the gene history index and split the given gene ids into
    those replaced by a new id and those that were discontinued.
    @return: (dict of old id -> new id, list of discontinued ids). '''
    query = ElasticQuery.filtered(Query.match_all(),
                                  TermsFilter.get_terms_filter("discontinued_geneid", gene_sets))
    docs = Search(query, idx=section['index'], idx_type=section['index_type_history'],
                  size=1000000).search().docs
    newgene_ids = {}
    discountinued_geneids = []
    for doc in docs:
        geneid = getattr(doc, 'geneid')
        discontinued_geneid = getattr(doc, 'discontinued_geneid')
        if geneid is None:
            # no replacement id -> the gene was discontinued
            discountinued_geneids.append(str(discontinued_geneid))
        else:
            newgene_ids[str(discontinued_geneid)] = str(geneid)
    return (newgene_ids, discountinued_geneids)
def range_overlap_query(cls, seqid, start_range, end_range, field_list=None,
                        seqid_param="seqid", start_param="start", end_param="end"):
    ''' Constructs a range overlap query: matches features whose start or
    end falls within the range, or which span the whole range. '''
    spanning = BoolQuery(must_arr=[RangeQuery(start_param, lte=start_range),
                                   RangeQuery(end_param, gte=end_range)])
    overlap_filter = OrFilter(RangeQuery(start_param, gte=start_range, lte=end_range))
    overlap_filter.extend(RangeQuery(end_param, gte=start_range, lte=end_range))
    overlap_filter.extend(spanning)
    return ElasticQuery.filtered(Query.term(seqid_param, seqid), overlap_filter, field_list)
def _check_gene_history(cls, gene_sets, config):
    '''find a way to handle this better

    Query the gene history index and split the given gene ids into
    those replaced by a new id and those that were discontinued.
    @return: (dict of old id -> new id, list of discontinued ids). '''
    section = config['GENE_HISTORY']
    query = ElasticQuery.filtered(Query.match_all(),
                                  TermsFilter.get_terms_filter("discontinued_geneid", gene_sets),
                                  sources=['geneid', 'discontinued_geneid'])
    docs = Search(query, idx=section['index'], idx_type=section['index_type'],
                  size=len(gene_sets)).search().docs
    newgene_ids = {}
    discountinued_geneids = []
    for doc in docs:
        geneid = getattr(doc, 'geneid')
        discontinued_geneid = getattr(doc, 'discontinued_geneid')
        if geneid is None:
            # no replacement id -> the gene was discontinued
            discountinued_geneids.append(str(discontinued_geneid))
        else:
            newgene_ids[str(discontinued_geneid)] = str(geneid)
    return (newgene_ids, discountinued_geneids)
def _update_gene(cls, genes, idx):
    ''' Use genes data to update the index.

    @param genes: dict keyed by entrez id, value is the partial document
                  to apply to the matching gene.
    @param idx: name of the elastic index to update. '''
    gene_keys = list(genes.keys())
    chunk_size = 450  # keep terms filters and bulk payloads bounded
    for i in range(0, len(gene_keys), chunk_size):
        chunk_gene_keys = gene_keys[i:i+chunk_size]
        json_data = ''
        query = ElasticQuery.filtered(Query.match_all(),
                                      TermsFilter.get_terms_filter("dbxrefs.entrez", chunk_gene_keys))
        docs = Search(query, idx=idx, size=chunk_size).search().docs
        for doc in docs:
            # IMPROVED: use the doc_id() accessor for consistency with
            # gene_mgi_parse rather than reaching into doc._meta
            ens_id = doc.doc_id()
            idx_type = doc.type()
            entrez = getattr(doc, 'dbxrefs')['entrez']
            # bulk API: action line followed by the partial document
            doc_data = {"update": {"_id": ens_id, "_type": idx_type,
                                   "_index": idx, "_retry_on_conflict": 3}}
            json_data += json.dumps(doc_data) + '\n'
            json_data += json.dumps({'doc': genes[entrez]}) + '\n'
        if json_data != '':
            Loader().bulk_load(idx, idx_type, json_data)
def studies_details(request):
    """ Get studies for a given ensembl ID.

    Returns the study hits with ensembl gene ids replaced by symbols and
    pmids expanded with first-author surname and journal. """
    ens_id = request.POST.get("ens_id")
    sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), sfilter)
    elastic = Search(query, idx=ElasticSettings.idx("REGION", "STUDY_HITS"), size=500)
    study_hits = elastic.get_json_response()["hits"]
    ens_ids = []
    pmids = []
    for hit in study_hits["hits"]:
        if "pmid" in hit["_source"]:
            pmids.append(hit["_source"]["pmid"])
        for ens_id in hit["_source"]["genes"]:
            ens_ids.append(ens_id)
    docs = _get_gene_docs_by_ensembl_id(ens_ids, ["symbol"])
    pub_docs = _get_pub_docs_by_pmid(pmids, sources=["authors.name", "journal"])
    for hit in study_hits["hits"]:
        genes = {}
        # replace ensembl ids with their gene symbols where known
        for ens_id in hit["_source"]["genes"]:
            try:
                genes[ens_id] = getattr(docs[ens_id], "symbol")
            except KeyError:
                genes = {ens_id: ens_id}
        hit["_source"]["genes"] = genes
        if "pmid" in hit["_source"]:
            pmid = hit["_source"]["pmid"]
            try:
                authors = getattr(pub_docs[pmid], "authors")
                journal = getattr(pub_docs[pmid], "journal")
                hit["_source"]["pmid"] = {
                    "pmid": pmid,
                    # BUG FIX: guard against an empty author list (was an
                    # unhandled IndexError; now consistent with the
                    # class-based view's handling)
                    "author": authors[0]["name"].rsplit(None, 1)[-1] if authors else "",
                    "journal": journal,
                }
            except KeyError:
                hit["_source"]["pmid"] = {"pmid": pmid}
    return JsonResponse(study_hits)
def genesets_details(request):
    """ Get pathway gene sets for a given ensembl ID, substituting gene
    symbols for the ensembl ids in each gene set. """
    target_id = request.POST.get("ens_id")
    gs_filter = Filter(Query.query_string(target_id, fields=["gene_sets"]).query_wrap())
    es_query = ElasticQuery.filtered(Query.match_all(), gs_filter)
    searcher = Search(es_query, idx=ElasticSettings.idx("GENE", "PATHWAY"), size=500)
    genesets_hits = searcher.get_json_response()["hits"]
    member_ids = [gid for hit in genesets_hits["hits"] for gid in hit["_source"]["gene_sets"]]
    gene_docs = _get_gene_docs_by_ensembl_id(member_ids, ["symbol"])
    for hit in genesets_hits["hits"]:
        symbol_map = {}
        for gid in hit["_source"]["gene_sets"]:
            try:
                symbol_map[gid] = getattr(gene_docs[gid], "symbol")
            except KeyError:
                # no symbol found; fall back to the ensembl id itself
                symbol_map[gid] = gid
        hit["_source"]["gene_sets"] = symbol_map
    return JsonResponse(genesets_hits)
def get_new_pmids(cls, pmids, idx, disease_code=None):
    ''' Find PMIDs in a list that are not in the elastic index.

    If disease_code is given, already-indexed publications lacking that
    code have it appended to their tags.disease list via a bulk update.
    @return: the pmids from the input list not found in the index. '''
    chunk_size = 800
    pmids_found = set()
    pmids_found_add = pmids_found.add  # hoisted bound method for the inner loop
    # NOTE(review): unconditional delay — presumably waits for a prior
    # index operation to settle; confirm it is still needed
    time.sleep(5)
    for i in range(0, len(pmids), chunk_size):
        pmids_slice = pmids[i:i+chunk_size]
        terms_filter = TermsFilter.get_terms_filter("pmid", pmids_slice)
        query = ElasticQuery.filtered(Query.match_all(), terms_filter, sources=['pmid', 'tags'])
        docs = Search(query, idx=idx, size=chunk_size).search().docs
        json_data = ''
        for doc in docs:
            pmids_found_add(getattr(doc, 'pmid'))
            if disease_code is not None:
                tags = getattr(doc, 'tags')
                if 'disease' in tags:
                    disease = tags['disease']
                else:
                    disease = []
                if disease_code not in disease:
                    # update disease attribute
                    disease.append(disease_code)
                    tags['disease'] = disease
                    idx_name = doc._meta['_index']
                    idx_type = doc.type()
                    # bulk API: action line followed by the partial document
                    doc_data = {"update": {"_id": doc._meta['_id'], "_type": idx_type,
                                           "_index": idx_name, "_retry_on_conflict": 3}}
                    json_data += json.dumps(doc_data) + '\n'
                    json_data += json.dumps({'doc': {'tags': tags}}) + '\n'
        if json_data != '':
            Loader().bulk_load(idx_name, idx_type, json_data)
    return [pmid for pmid in pmids if pmid not in pmids_found]
def genesets_details(request):
    ''' Get pathway gene sets for a given ensembl ID. '''
    ens_id = request.POST.get('ens_id')
    geneset_filter = Filter(Query.query_string(ens_id, fields=["gene_sets"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), geneset_filter)
    elastic = Search(query, idx=ElasticSettings.idx('GENE', 'PATHWAY'), size=500)
    genesets_hits = elastic.get_json_response()['hits']
    # collect every gene id from every matched gene set
    ens_ids = []
    for hit in genesets_hits['hits']:
        for ens_id in hit['_source']['gene_sets']:
            ens_ids.append(ens_id)
    docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
    for hit in genesets_hits['hits']:
        genesets = {}
        for ens_id in hit['_source']['gene_sets']:
            try:
                genesets[ens_id] = getattr(docs[ens_id], 'symbol')
            except KeyError:
                # no symbol found; fall back to the ensembl id itself
                genesets[ens_id] = ens_id
        hit['_source']['gene_sets'] = genesets
    return JsonResponse(genesets_hits)
def _convert_entrezid2ensembl(cls, gene_sets, section, log_output_file_handler=None, log_conversion=True):
    '''Converts given set of entrez ids to ensembl ids by querying the gene index dbxrefs'''
    # first check in gene_history and swap old ids for their replacements
    (newgene_ids, discontinued_ids) = cls._check_gene_history(gene_sets, section)
    replaced_gene_sets = cls._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)
    entrez_filter = TermsFilter.get_terms_filter("dbxrefs.entrez", replaced_gene_sets)
    query = ElasticQuery.filtered(Query.match_all(), entrez_filter)
    matched = Search(query, idx=section['index'], size=1000000).search().docs
    ensembl_ids = [doc._meta['_id'] for doc in matched]
    if log_conversion and log_output_file_handler is not None:
        cls._log_entrezid2ensembl_coversion(replaced_gene_sets, ensembl_ids, log_output_file_handler)
    return ensembl_ids
def _build_snp_query(snp_track, chrom, segmin, segmax):
    ''' Fetch SNPs for the segment from the track's index and return them
    (positions made segment-relative) with the track's display settings. '''
    snps = []
    snpMeta = {}
    maxScore = -1
    if snp_track and snp_track != 'None':
        # get SNPs based on this segment
        # snp_track is "<group>-<track>"
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        snp_track_idx = getattr(chicp_settings, 'CHICP_IDX').get(group).get('INDEX')
        snp_track_type = ''
        if getattr(chicp_settings, 'CHICP_IDX').get(group).get('TRACKS').get(snp_track):
            snp_track_type = getattr(chicp_settings, 'CHICP_IDX').get(group).get('TRACKS') \
                .get(snp_track).get('TYPE')
        else:
            # no TYPE configured for this track; use the track name itself
            snp_track_type = track
        # seqid may be stored with or without a "chr" prefix
        query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                      Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                      utils.snpFields)
        snpQuery = Search(search_query=query, search_from=0, size=2000000,
                          idx=snp_track_idx+'/'+snp_track_type)
        snpResult = snpQuery.get_result()
        snps = snpResult['data']
        snps = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], snps)
        data_type = getattr(chicp_settings, 'CHICP_IDX').get(group).get('DATA_TYPE')
        snpSettings = getattr(chicp_settings, 'STUDY_DEFAULTS').get(data_type)
        # if 'max' in snpSettings:
        #     maxScore = float(snpSettings['max'])
        # else:
        # track the highest score seen so the client can scale the display
        for s in snps:
            if float(s['score']) > maxScore:
                maxScore = float(s['score'])
        snpSettings['max'] = maxScore
        snpMeta = snpSettings
    return snps, snpMeta
def check_hits(resp_json):
    # NOTE(review): closure — relies on `self`, `internal_id` and `logger`
    # from the enclosing scope; flags markers that share an internal id
    # but are recorded at different positions.
    self.assertTrue('hits' in resp_json, 'scan and scroll hits')
    self.assertGreaterEqual(len(resp_json['hits']['hits']), 1)
    docs = [Document(hit) for hit in resp_json['hits']['hits']]
    for doc1 in docs:
        doc_internal_id = getattr(doc1, "internal_id")
        if doc_internal_id in internal_id:
            pos1 = self._get_highest_build(doc1)
            for doc2 in internal_id[doc_internal_id]:
                pos2 = self._get_highest_build(doc2)
                if pos2['position'] != pos1['position']:
                    msg = ("DIFFERENT POSITIONS ID: "+str(doc_internal_id)+":\t" +
                           str(getattr(doc1, "name"))+": "+pos1['position']+" ("+doc1.doc_id()+")\t" +
                           str(getattr(doc2, "name"))+": "+pos2['position']+" ("+doc2.doc_id()+")\t")
                    try:
                        # list the markers stored at either position
                        terms_filter = TermsFilter.get_terms_filter("start", [pos1['position'],
                                                                              pos2['position']])
                        query = ElasticQuery.filtered(Query.term("seqid", pos1['seqid']), terms_filter)
                        elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
                        docs_by_pos = elastic.search().docs
                        found = False
                        for d in docs_by_pos:
                            msg += getattr(d, "id")+": "+str(getattr(d, "start"))+"\t"
                            if getattr(d, "id") == 'rs'+str(doc_internal_id):
                                found = True
                        if not found:
                            msg += 'rs'+str(doc_internal_id)
                            if self._rs_exists('rs'+str(doc_internal_id)):
                                msg += ' EXISTS IN DBSNP\t'
                            else:
                                msg += ' NOT IN DBSNP\t'
                        logger.error(msg)
                    except KeyError:
                        # position lookup failed; log what we have so far
                        logger.error(msg)
            internal_id[doc_internal_id].append(doc1)
        else:
            internal_id[doc_internal_id] = [doc1]
def _build_snp_query(snp_track, chrom, segmin, segmax):
    ''' Fetch SNPs for the segment from the track's index and return them
    (positions made segment-relative) with the track's display settings. '''
    snps = []
    snpMeta = {}
    maxScore = -1
    if snp_track and snp_track != 'None':
        # get SNPs based on this segment; snp_track is "<group>-<track>"
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        try:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper(), snp_track.upper())
        except SettingsError:
            # no idx type configured for the track; fall back to the track name
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper())+"/"+track
        # seqid may be stored with or without a "chr" prefix
        query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                      Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                      utils.snpFields)
        snpQuery = Search(search_query=query, search_from=0, size=10000, idx=snp_track_idx)
        snpResult = snpQuery.get_json_response()
        # IMPROVED: dropped dead commented-out get_result() code and the
        # redundant re-initialisation of snps; build with a comprehension
        snps = [hit['_source'] for hit in snpResult['hits']['hits']]
        snps = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], snps)
        data_type = ElasticSettings.get_label('CP_STATS_'+group.upper(), None, "data_type")
        snpSettings = getattr(chicp_settings, 'STUDY_DEFAULTS').get(data_type)
        # track the highest score seen so the client can scale the display
        for s in snps:
            if float(s['score']) > maxScore:
                maxScore = float(s['score'])
        snpSettings['max'] = maxScore
        snpMeta = snpSettings
    return snps, snpMeta
def filter_queryset(self, request, queryset, view):
    ''' Get disease regions.

    Builds region objects for the requested disease, optionally attaching
    marker and gene objects; also detects pleiotropy via IC_STATS and
    other study hits.
    @raise Http404: on any processing/connection error. '''
    try:
        filterable = getattr(view, 'filter_fields', [])
        # keep only GET parameters the view declares as filterable
        filters = dict([(k, v) for k, v in request.GET.items() if k in filterable])
        dis = filters.get('disease', 'T1D')
        show_genes = filters.get('genes', False)
        show_markers = filters.get('markers', False)
        show_regions = filters.get('regions', True)
        build = self._get_build(filters.get('build', settings.DEFAULT_BUILD))
        docs = DiseaseLocusDocument.get_disease_loci_docs(dis)
        if len(docs) == 0:
            messages.error(request, 'No regions found for '+dis+'.')
        visible_hits = DiseaseLocusDocument.get_hits([h for r in docs for h in getattr(r, 'hits')])
        regions = []
        all_markers = []
        all_genes = []
        ens_all_cand_genes = []
        for r in docs:
            region = r.get_disease_region(visible_hits, build=build)
            if region is not None:
                ens_all_cand_genes.extend(region['ens_cand_genes'])
                all_markers.extend(region['markers'])
                region['hits'] = [self._study_hit_obj(s, region) for s in
                                  StudyHitDocument.process_hits(r.hit_docs, region['all_diseases'])]
                # genes within +/-500kb of the region, split into
                # upstream/in-region/downstream, coding/non-coding
                (all_coding, all_non_coding) = views.get_genes_for_region(getattr(r, "seqid"),
                                                                          region['rstart']-500000,
                                                                          region['rstop']+500000)
                (region_coding, coding_up, coding_down) = views._region_up_down(all_coding,
                                                                                region['rstart'],
                                                                                region['rstop'])
                (region_non_coding, non_coding_up, non_coding_down) = \
                    views._region_up_down(all_non_coding, region['rstart'], region['rstop'])
                region['genes'] = {
                    'upstream': {'coding': [g.doc_id() for g in coding_up],
                                 'non_coding': [g.doc_id() for g in non_coding_up]},
                    'region': {'coding': [g.doc_id() for g in region_coding],
                               'non_coding': [g.doc_id() for g in region_non_coding]},
                    'downstream': {'coding': [g.doc_id() for g in coding_down],
                                   'non_coding': [g.doc_id() for g in non_coding_down]},
                }
                all_genes.extend(region['genes']['region']['coding'])
                all_genes.extend(region['genes']['region']['non_coding'])
                regions.append(region)
        # look for pleiotropy by looking for diseases for the markers in IC_STATS and other study hits
        stats_query = ElasticQuery.filtered(Query.terms("marker", all_markers),
                                            Filter(RangeQuery("p_value", lte=5E-08)))
        stats_docs = Search(stats_query, idx=ElasticSettings.idx("IC_STATS"),
                            size=len(all_markers)).search().docs
        meta_response = Search.elastic_request(ElasticSettings.url(),
                                               ElasticSettings.idx("IC_STATS") + '/_mapping',
                                               is_post=False)
        # get ensembl to gene symbol mapping for all candidate genes
        extra_markers = []
        for region in regions:
            # add diseases from IC/GWAS stats
            (study_ids, region['marker_stats']) = views._process_stats(stats_docs,
                                                                       region['markers'],
                                                                       meta_response)
            region['all_diseases'].extend([getattr(mstat, 'disease')
                                           for mstat in region['marker_stats']])
            # study hits for the same markers from other (tier<=2) studies
            other_hits_query = ElasticQuery(
                BoolQuery(must_arr=[RangeQuery("tier", lte=2),
                                    Query.terms("marker", region['markers'])],
                          must_not_arr=[Query.terms("dil_study_id", study_ids)]))
            other_hits = Search(other_hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'),
                                size=100).search()
            region['extra_markers'] = [self._study_hit_obj(s, region) for s in
                                       StudyHitDocument.process_hits(other_hits.docs,
                                                                     region['all_diseases'])]
            region['all_diseases'] = list(set(region['all_diseases']))
            extra_markers.extend([m['marker_id'] for m in region['extra_markers']])
        # get markers
        marker_objs = []
        if show_markers:
            query = ElasticQuery(Query.terms("id", all_markers), sources=['id', 'start'])
            marker_docs = Search(search_query=query, idx=ElasticSettings.idx('MARKER', 'MARKER'),
                                 size=len(all_markers)).search().docs
            mids = {getattr(m, 'id'): getattr(m, 'start') for m in marker_docs}
            marker_objs = [h for r in regions for h in r['hits']]
            marker_objs.extend([h for r in regions for h in r['extra_markers']])
            for m in marker_objs:
                m['start'] = mids[m['marker_id']]
        # get genes
        gene_objs = []
        if show_genes:
            all_genes.extend(ens_all_cand_genes)
            gene_docs = GeneDocument.get_genes(all_genes,
                                               sources=['start', 'stop', 'chromosome',
                                                        'symbol', 'biotype'])
            for doc in Document.sorted_alphanum(gene_docs, 'chromosome'):
                ensembl_id = doc.doc_id()
                region_name = ''
                candidate_gene = 0
                # find which (first) region this gene belongs to
                for region in regions:
                    if ('genes' in region and
                        (ensembl_id in region['genes']['region']['coding'] or
                         ensembl_id in region['genes']['region']['non_coding'] or
                         ensembl_id in region['ens_cand_genes'])):
                        region_name = region['region_name']
                        candidate_gene = 1 if ensembl_id in region['ens_cand_genes'] else 0
                        break
                gene_objs.append({
                    'ensembl_id': ensembl_id,
                    'seqid': 'chr'+getattr(doc, 'chromosome'),
                    'start': getattr(doc, 'start'),
                    'end': getattr(doc, 'stop'),
                    'symbol': getattr(doc, 'symbol'),
                    'biotype': getattr(doc, 'biotype'),
                    'region_name': region_name,
                    'candidate_gene': candidate_gene
                })
        if show_regions == 'false':
            regions = []
        regions.extend(gene_objs)
        regions.extend(marker_objs)
        return regions
    except (TypeError, ValueError, IndexError, ConnectionError) as e:
        print(e)
        raise Http404
def test_elastic_group_name(self):
    '''
    Testing the workflow defined in:
    https://killin.cimr.cam.ac.uk/nextgensite/2015/08/05/region-authorization/
    Testing various elastic queries

    idx doc:
    "_source":{"attr": {"region_id": "803", "group_name": "[\"DIL\"]", "Name": "4q27"},
    "seqid": "chr4", "source": "immunobase", "type": "region", "score": ".",
    "strand": ".", "phase": ".", "start": 122061159, "end": 122684373}
    idx_query:
    Private(in given group) OR Public
    -d '{"query":{"filtered":{"filter":{"bool": {
        "should": [ {"terms": {"group_name":["dil"]}},
                    { "missing": { "field": "group_name" }} ] }}}}}'
    Private(in given group):
    -d '{"query":{"filtered":{"filter":{"terms":{"group_name":["dil"]}}}}}'
    Public:
    -d {'query': {'filtered': {'filter': {'missing': {'field': 'group_name'}},
    -             'query': {'term': {'match_all': '{}'}}}}}
    '''
    # get the groups for the given user
    response = self.client.post('/accounts/login/', {'username': '******', 'password': '******'})
    self.assertTrue(response.status_code, "200")
    logged_in_user = User.objects.get(id=self.client.session['_auth_user_id'])
    if logged_in_user and logged_in_user.is_authenticated():
        user_groups = get_user_groups(logged_in_user)
        self.assertTrue('READ' in user_groups, "user present in READ group")
        # make sure the user is not yet in DIL group
        self.assertFalse('DIL' in user_groups, "user not present in DIL group")
    group_names = get_user_groups(logged_in_user)
    if 'READ' in group_names:
        group_names.remove('READ')  # @IgnorePep8
    group_names = [x.lower() for x in group_names]
    self.assertTrue(len(group_names) == 0, "No group present")
    # Match all query, as there is no group we do a match all
    query = ElasticQuery(Query.match_all())
    expected_query_string = {"query": {"match_all": {}}}
    self.assertJSONEqual(json.dumps(query.query), json.dumps(expected_query_string),
                         "Query string matched")
    Search.index_refresh(self.index_name)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 12, "Elastic string query retrieved all public regions")
    # Filtered query for group names, add the user to DIL group and get the query string
    self.dil_group = Group.objects.create(name='DIL')
    logged_in_user.groups.add(self.dil_group)
    group_names = get_user_groups(logged_in_user)
    if 'READ' in group_names:
        group_names.remove('READ')  # @IgnorePep8
    group_names = [x.lower() for x in group_names]
    self.assertTrue(len(group_names) > 0, "More than 1 group present")
    self.assertTrue("dil" in group_names, "DIL group present")
    # retrieves all docs with missing field group_name - 11 docs
    terms_filter = TermsFilter.get_missing_terms_filter("field", "attr.group_name")
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 11, "Elastic string query retrieved all public regions")
    # build filtered boolean query to bring all public docs + private docs 11+1 = 12 docs
    query_bool = BoolQuery()
    query_bool.should(Query.missing_terms("field", "group_name")) \
        .should(Query.terms("group_name", group_names).query_wrap())
    query = ElasticQuery.filtered_bool(Query.match_all(), query_bool)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 12, "Elastic string query retrieved both public + private regions")
    # private-only query: terms filter on the user's groups
    terms_filter = TermsFilter.get_terms_filter("attr.group_name", group_names)
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 1, "Elastic string query retrieved one private regions")
    self.assertEqual(docs[0].attr['Name'], "4q27", "type matched region")
    self.assertEqual(docs[0].attr['region_id'], "803", "type matched region")
    self.assertEqual(docs[0].attr['group_name'], "[\"DIL\"]", "type matched region")
def test_not_filtered_query(self):
    ''' Test building and running a filtered query. '''
    exclude_filter = NotFilter(RangeQuery("start", lte=10000))
    filtered_query = ElasticQuery.filtered(Query.term("seqid", 1), exclude_filter)
    result = Search(filtered_query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total >= 1, "Elastic filtered query retrieved marker(s)")
def get_regions(cls, request, dis, context):
    ''' Populate ``context`` with disease-region data for the disease code(s) in ``dis``.

    Looks up the disease, gathers its disease-locus documents, builds a region
    dict per locus (hits, candidate genes, up/downstream coding and non-coding
    genes) and annotates each region with marker statistics and extra study
    hits. Raises ``Http404`` (after flashing a message) when the disease or
    its regions cannot be found. Returns the updated ``context``.
    '''
    # is_authenticated = False
    elastic_url = ElasticSettings.url()
    # dis may be a comma-separated list of disease codes
    (core, other) = Disease.get_site_diseases(dis_list=dis.upper().split(','))
    if len(core) == 0 and len(other) == 0:
        messages.error(request, 'Disease '+dis+' not found.')
        raise Http404()
    # prefer a core disease when one exists
    disease = core[0] if len(core) > 0 else other[0]
    context['title'] = getattr(disease, "name")+" Regions"
    docs = DiseaseLocusDocument.get_disease_loci_docs(dis)
    if len(docs) == 0:
        messages.error(request, 'No regions found for '+dis+'.')
        raise Http404()
    # flatten the hit ids of every locus and resolve them to hit documents
    visible_hits = DiseaseLocusDocument.get_hits([h for r in docs for h in getattr(r, 'hits')])
    # IC_STATS index mapping; its _meta later supplies per-index disease info in _process_stats
    meta_response = Search.elastic_request(elastic_url, ElasticSettings.idx("IC_STATS") + '/_mapping',
                                           is_post=False)
    regions = []
    ens_all_cand_genes = []
    all_markers = []
    for r in docs:
        region = r.get_disease_region(visible_hits)
        if region is not None:
            ens_all_cand_genes.extend(region['ens_cand_genes'])
            all_markers.extend(region['markers'])
            region['hits'] = StudyHitDocument.process_hits(r.hit_docs, region['all_diseases'])
            # fetch genes within +/-500kb of the region, then bucket them
            # relative to the region boundaries
            (all_coding, all_non_coding) = get_genes_for_region(getattr(r, "seqid"),
                                                                region['rstart']-500000,
                                                                region['rstop']+500000)
            (region_coding, coding_up, coding_down) = _region_up_down(all_coding,
                                                                      region['rstart'], region['rstop'])
            (region_non_coding, non_coding_up, non_coding_down) = \
                _region_up_down(all_non_coding, region['rstart'], region['rstop'])
            region['genes'] = {
                'upstream': {'coding': coding_up, 'non_coding': non_coding_up},
                'region': {'coding': region_coding, 'non_coding': region_non_coding},
                'downstream': {'coding': coding_down, 'non_coding': non_coding_down},
            }
            regions.append(region)

    # look for pleiotropy by looking for diseases for the markers in IC_STATS and other study hits
    stats_query = ElasticQuery.filtered(Query.terms("marker", all_markers),
                                        Filter(RangeQuery("p_value", lte=5E-08)))
    stats_docs = Search(stats_query, idx=ElasticSettings.idx("IC_STATS"),
                        size=len(all_markers)).search().docs

    # get ensembl to gene symbol mapping for all candidate genes
    all_cand_genes = gene.utils.get_gene_docs_by_ensembl_id(ens_all_cand_genes)
    for region in regions:
        region['cand_genes'] = {cg: all_cand_genes[cg] for cg in region.pop("ens_cand_genes", None)}
        (study_ids, region['marker_stats']) = _process_stats(stats_docs, region['markers'], meta_response)
        # add diseases from IC/GWAS stats
        region['all_diseases'].extend([getattr(mstat, 'disease') for mstat in region['marker_stats']])
        # tier<=2 hits for the region's markers, excluding studies already counted above
        other_hits_query = ElasticQuery(
            BoolQuery(must_arr=[RangeQuery("tier", lte=2), Query.terms("marker", region['markers'])],
                      must_not_arr=[Query.terms("dil_study_id", study_ids)]))
        other_hits = Search(other_hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'),
                            size=100).search()
        region['extra_markers'] = StudyHitDocument.process_hits(other_hits.docs, region['all_diseases'])
    context['regions'] = regions
    context['disease_code'] = [dis]
    context['disease'] = getattr(disease, "name")
    return context
def test_term_filtered_query(self):
    ''' Run a filtered query pairing a seqid term query with an id term filter. '''
    id_filter = Filter(Query.term("id", "rs768019142"))
    filtered_query = ElasticQuery.filtered(Query.term("seqid", 1), id_filter)
    result = Search(filtered_query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total == 1, "Elastic filtered query retrieved marker")
def test_terms_filtered_query(self):
    ''' Run a filtered query restricted by a terms filter over marker ids. '''
    marker_ids = ["rs2476601", "rs768019142"]
    ids_filter = TermsFilter.get_terms_filter("id", marker_ids)
    filtered_query = ElasticQuery.filtered(Query.term("seqid", 1), ids_filter)
    result = Search(filtered_query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total >= 1, "Elastic filtered query retrieved marker(s)")
def test_type_filtered_query(self):
    ''' Run a filtered query restricted to the "marker" document type. '''
    doc_type_filter = Filter(Query.query_type_for_filter("marker"))
    filtered_query = ElasticQuery.filtered(Query.term("seqid", 1), doc_type_filter)
    total = Search(filtered_query, idx=ElasticSettings.idx('DEFAULT')).search().hits_total
    self.assertTrue(total >= 1, "Elastic filtered query retrieved marker(s)")
def chicpeaSearch(request, url):
    ''' CHiCP search view: resolve a gene / region / SNP search term into a
    Hi-C interaction dataset and return a JSON payload for the browser.

    The search term is classified as a region (``chr:start-end``), an rs-id
    SNP, or (by default) a gene name; each branch builds an elastic
    bool/filter query against the target index and falls through to a common
    section that collects genes, SNPs and restriction fragments for the
    resolved segment. Errors are returned as ``{'error': ...}`` JSON.
    '''
    queryDict = request.GET
    user = request.user
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    hic = []
    addList = []
    searchType = 'gene'  # default classification; may become 'region' or 'snp' below
    searchTerm = queryDict.get("searchTerm").upper()
    searchTerm = searchTerm.replace(",", "")
    searchTerm = searchTerm.replace("..", "-")
    searchTerm = searchTerm.replace(" ", "")  # Chris suggestion to prevent issue with spaces in queries
    snpTrack = queryDict.get("snp_track")

    # indices/types this user is allowed to see
    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(
        user=user, idx_keys=None, idx_type_keys=None)

    if snpTrack:
        # drop the requested SNP track if the user lacks permission for it
        mo = re.match(r"(.*)-(.*)", snpTrack)
        (group, track) = mo.group(1, 2)  # @UnusedVariable
        if group != 'ud' and 'CP_STATS_'+group.upper()+'.'+snpTrack.upper() not in idx_type_keys_auth:
            snpTrack = None

    if targetIdx not in utils.tissues:
        # lazily populate the per-target tissue list cache from the index mapping
        for target in getattr(chicp_settings, 'CP_TARGET'):
            if 'CP_TARGET_'+target not in idx_keys_auth:
                if targetIdx == target:
                    retJSON = {'error': 'Sorry, you do not have permission to view this dataset.'}
                    return JsonResponse(retJSON)
                continue
            elasticJSON = Search(idx=ElasticSettings.idx('CP_TARGET_'+target)).get_mapping(
                mapping_type="gene_target")
            tissueList = list(elasticJSON[ElasticSettings.idx('CP_TARGET_'+target)]
                              ['mappings']['gene_target']['_meta']['tissue_type'].keys())
            utils.tissues['CP_TARGET_'+target] = tissueList

    if queryDict.get("region") or re.match(r"(.*):(\d+)-(\d+)", searchTerm):
        # explicit region parameter, or the search term itself looks like chr:start-end
        searchType = 'region'
        region = searchTerm
        if queryDict.get("region"):
            region = queryDict.get("region")
        else:
            searchTerm = ""
        mo = re.match(r"(.*):(\d+)-(\d+)", region)
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "")
        chrom = chrom.replace('CHR', "")
    if re.search("^rs[0-9]+", searchTerm.lower()):
        # rs-id: resolve the SNP to a genomic position first
        searchTerm = searchTerm.lower()
        addList.append(_find_snp_position(snpTrack, searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        if searchType != 'region':
            searchType = 'snp'

    logger.warn("### "+searchType+" - "+searchTerm+' ###')

    if searchType == 'region':
        query_bool = BoolQuery()
        filter_bool = BoolQuery()
        # include a free-text clause only when a bare term was given alongside a region param
        if searchTerm and len(addList) == 0 and re.match(r"(.*):(\d+)-(\d+)",
                                                         queryDict.get("searchTerm").replace(",", "")) == None:
            query_bool.must([Query.query_string(searchTerm, fields=["name", "ensg"]),
                             Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        else:
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if len(addList) > 0:
            # SNP position known: match interactions whose bait OR other-end spans it
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])
        else:
            # plain region: bait OR other-end must fall inside [segmin, segmax]
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])

        (hic, v1, v2) = _build_hic_query(query, targetIdx, segmin, segmax)  # @UnusedVariable

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': queryDict.get("searchTerm")+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        if len(addList) > 0:
            chrom = addList[0]['chr']
            query_bool = BoolQuery()
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
            query_bool = _add_tissue_filter(query_bool, targetIdx)
            filter_bool = BoolQuery()
            # interaction must span the SNP position on either the bait or other-end
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])
            query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                               sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
            # segment bounds are derived from the hits here, not supplied
            hic, segmin, segmax = _build_hic_query(query, targetIdx)

            if "error" in hic:
                return JsonResponse(hic)
            if len(hic) == 0:
                retJSON = {'error': 'Marker '+searchTerm+' does not overlap any bait/target regions in this dataset.'}
                return JsonResponse(retJSON)
    else:
        # gene search
        # geneQuery = ElasticQuery.query_string(searchTerm, fields=["gene_name"])
        geneQuery = ElasticQuery.filtered(Query.match_all(),
                                          Filter(Query.match("gene_name", searchTerm).query_wrap()))
        resultObj = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/',
                           search_query=geneQuery, size=0, qsort=Sort('seqid:asc,start')).search()
        if resultObj.hits_total > 1:
            # ambiguous gene name: keep only candidates that actually occur in the
            # target dataset (via the ensg aggregation), and ask the user to choose
            geneResults = []
            resultObj2 = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/',
                                search_query=geneQuery, size=(resultObj.hits_total+1),
                                qsort=Sort('seqid:asc,start')).search()
            docs = resultObj2.docs
            # gene_id values are stored quoted; strip the surrounding quotes
            gene_ids = [getattr(doc, 'attr')['gene_id'][1:-1] for doc in docs]

            query = ElasticQuery.filtered(Query.match_all(), TermsFilter.get_terms_filter('ensg', gene_ids))
            agg = Agg('ensg_agg', "terms", {"field": "ensg", "size": 0})
            res = Search(idx=ElasticSettings.idx('CP_TARGET_'+targetIdx), search_query=query,
                         aggs=Aggs(agg), size=0).search()
            ensg_count = res.aggs['ensg_agg'].get_buckets()
            gene_ids = [g['key'] for g in ensg_count]

            for d in resultObj2.docs:
                if getattr(d, "attr")["gene_id"].replace('\"', '') in gene_ids:
                    geneResults.append({
                        'gene_name': getattr(d, "attr")["gene_name"].replace('\"', ''),
                        'gene_id': getattr(d, "attr")["gene_id"].replace('\"', ''),
                        'location': "chr" + getattr(d, "seqid") + ":" +
                        locale.format_string("%d", getattr(d, "start"), grouping=True) + ".." +
                        locale.format_string("%d", getattr(d, "end"), grouping=True),
                    })

            if len(geneResults) == 0:
                retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
                return JsonResponse(retJSON)
            elif len(geneResults) > 1:
                retJSON = {
                    'error': 'Gene name <strong>'+searchTerm+'</strong> returns too many hits, please select your prefered result from the list below.',
                    'results': geneResults,
                    'cols': ['HGNC Symbol', 'Ensembl Gene ID', 'Location']
                }
                return JsonResponse(retJSON)

        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
        (hic, segmin, segmax) = _build_hic_query(query, targetIdx)
        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
            return JsonResponse(retJSON)
        chrom = hic[0]['baitChr']

    # NOTE(review): guards against chrom never being bound by any branch above
    try:
        chrom
    except NameError:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(snpTrack, chrom, segmin, segmax)
    frags = _build_frags_query(getattr(chicp_settings, 'DEFAULT_FRAG'), chrom, segmin, segmax)
    # rebase SNP positions relative to the segment start for display
    addList = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], addList)
    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": int(segmin),
                        "oend": int(segmax),
                        "rstart": 1,
                        "rend": int(segmax) - int(segmin),
                        "rchr": str(chrom),
                        "tissues": utils.tissues['CP_TARGET_'+targetIdx]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }
    response = JsonResponse(retJSON)
    return response
def test_gene_pipeline(self):
    """ Test gene pipeline.

    Runs the gene-loading pipeline stage by stage against test data and checks
    after each stage that the expected fields appear on the indexed documents.
    """
    INI_CONFIG = IniParser().read_ini(MY_INI_FILE)
    idx = INI_CONFIG["ENSEMBL_GENE_GTF"]["index"]
    idx_type = INI_CONFIG["ENSEMBL_GENE_GTF"]["index_type"]

    """ 1. Test ensembl GTF loading. """
    call_command(
        "pipeline", "--steps", "stage", "load", sections="ENSEMBL_GENE_GTF",
        dir=TEST_DATA_DIR, ini=MY_INI_FILE
    )
    Search.index_refresh(idx)

    elastic = Search(idx=idx, idx_type=idx_type)
    self.assertGreaterEqual(elastic.get_count()["count"], 1, "Count documents in the index")
    # compare the generated mapping against the mapping actually in the index
    map1_props = Gene.gene_mapping(idx, idx_type, test_mode=True).mapping_properties
    map2_props = elastic.get_mapping()
    if idx not in map2_props:
        logger.error("MAPPING ERROR: " + json.dumps(map2_props))
    self._cmpMappings(map2_props[idx]["mappings"], map1_props, idx_type)

    """ 2. Test adding entrez ID to documents """
    call_command("pipeline", "--steps", "load", sections="GENE2ENSEMBL", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)
    query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    self.assertEqual(len(docs), 1)
    self.assertTrue("entrez" in getattr(docs[0], "dbxrefs"))
    self.assertEqual(getattr(docs[0], "dbxrefs")["entrez"], "26191")

    """ 3. Add uniprot and fill in missing entrez fields. """
    call_command(
        "pipeline", "--steps", "download", "load", sections="ENSMART_GENE",
        dir=TEST_DATA_DIR, ini=MY_INI_FILE
    )
    Search.index_refresh(idx)
    query = ElasticQuery.query_string("DNMT3L", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    self.assertTrue("entrez" in getattr(docs[0], "dbxrefs"))
    self.assertTrue("swissprot" in getattr(docs[0], "dbxrefs"))

    """ 4. Add gene synonyms and dbxrefs. """
    call_command("pipeline", "--steps", "load", sections="GENE_INFO", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)
    query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    self.assertTrue("PTPN8" in getattr(docs[0], "synonyms"))

    """ 5. Add PMIDs to gene docs. """
    call_command("pipeline", "--steps", "load", sections="GENE_PUBS", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)
    query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    self.assertGreater(len(getattr(docs[0], "pmids")), 0)

    """ 6. Add ortholog data. """
    call_command("pipeline", "--steps", "load", sections="ENSMART_HOMOLOG", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)
    query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
    elastic = Search(query, idx=idx)
    docs = elastic.search().docs
    dbxrefs = getattr(docs[0], "dbxrefs")
    self.assertTrue("orthologs" in dbxrefs, dbxrefs)
    self.assertTrue("mmusculus" in dbxrefs["orthologs"], dbxrefs)
    self.assertEqual("ENSMUSG00000027843", dbxrefs["orthologs"]["mmusculus"]["ensembl"])

    # orthologs should also be queryable via the nested dbxrefs field
    query = ElasticQuery.filtered(
        Query.match_all(),
        TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl", ["ENSMUSG00000027843"]),
    )
    docs = Search(query, idx=idx, size=1).search().docs
    self.assertEqual(len(docs), 1)

    """ 7. Add mouse ortholog link to MGI """
    call_command("pipeline", "--steps", "load", sections="ENSEMBL2MGI", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)
    # reuse the ortholog terms-filter query from step 6
    docs = Search(query, idx=idx, size=1).search().docs
    dbxrefs = getattr(docs[0], "dbxrefs")
    self.assertEqual("ENSMUSG00000027843", dbxrefs["orthologs"]["mmusculus"]["ensembl"])
    self.assertEqual("107170", dbxrefs["orthologs"]["mmusculus"]["MGI"])
def test_elastic_group_name(self):
    '''
    Testing the workflow defined in: https://killin.cimr.cam.ac.uk/nextgensite/2015/08/05/region-authorization/
    Testing various elastic queries

    idx doc:
     "_source":{"attr": {"region_id": "803", "group_name": "[\"DIL\"]", "Name": "4q27"},
     "seqid": "chr4", "source": "immunobase", "type": "region",
     "score": ".", "strand": ".", "phase": ".", "start": 122061159, "end": 122684373}

    idx_query:
    Private(in given group) OR Public
    -d '{"query":{"filtered":{"filter":{"bool": {
                    "should": [
                        {"terms": {"group_name":["dil"]}},
                        { "missing": { "field": "group_name" }}
                        ]
                    }}}}}'
    Private(in given group):
    -d '{"query":{"filtered":{"filter":{"terms":{"group_name":["dil"]}}}}}'
    Public:
    -d {'query': {'filtered': {'filter': {'missing': {'field': 'group_name'}},
    - 'query': {'term': {'match_all': '{}'}}}}}
    '''
    # get the groups for the given user
    response = self.client.post('/accounts/login/', {'username': '******', 'password': '******'})
    self.assertTrue(response.status_code, "200")

    logged_in_user = User.objects.get(id=self.client.session['_auth_user_id'])
    if logged_in_user and logged_in_user.is_authenticated():
        user_groups = get_user_groups(logged_in_user)
        self.assertTrue('READ' in user_groups, "user present in READ group")
        # make sure the user is not yet in DIL group
        self.assertFalse('DIL' in user_groups, "user not present in DIL group")

    # every user is in READ by default; strip it so only "real" groups remain
    group_names = get_user_groups(logged_in_user)
    if 'READ' in group_names:
        group_names.remove('READ')  # @IgnorePep8
    group_names = [x.lower() for x in group_names]
    self.assertTrue(len(group_names) == 0, "No group present")

    # Match all query, as there is no group we do a match all
    query = ElasticQuery(Query.match_all())
    expected_query_string = {"query": {"match_all": {}}}
    self.assertJSONEqual(json.dumps(query.query), json.dumps(expected_query_string),
                         "Query string matched")

    Search.index_refresh(self.index_name)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 12, "Elastic string query retrieved all public regions")

    # Filtered query for group names, add the user to DIL group and get the query string
    self.dil_group = Group.objects.create(name='DIL')
    logged_in_user.groups.add(self.dil_group)
    group_names = get_user_groups(logged_in_user)
    if 'READ' in group_names:
        group_names.remove('READ')  # @IgnorePep8
    group_names = [x.lower() for x in group_names]
    self.assertTrue(len(group_names) > 0, "More than 1 group present")
    self.assertTrue("dil" in group_names, "DIL group present")

    # retrieves all docs with missing field group_name - 11 docs
    terms_filter = TermsFilter.get_missing_terms_filter("field", "attr.group_name")
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 11, "Elastic string query retrieved all public regions")

    # build filtered boolean query to bring all public docs + private docs 11+1 = 12 docs
    query_bool = BoolQuery()
    query_bool.should(Query.missing_terms("field", "group_name")) \
              .should(Query.terms("group_name", group_names).query_wrap())
    query = ElasticQuery.filtered_bool(Query.match_all(), query_bool)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 12, "Elastic string query retrieved both public + private regions")

    # private-only: terms filter on the user's group names retrieves the single DIL doc
    terms_filter = TermsFilter.get_terms_filter("attr.group_name", group_names)
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 1, "Elastic string query retrieved one private regions")
    self.assertEqual(docs[0].attr['Name'], "4q27", "type matched region")
    self.assertEqual(docs[0].attr['region_id'], "803", "type matched region")
    self.assertEqual(docs[0].attr['group_name'], "[\"DIL\"]", "type matched region")
def test_exists_filtered_query(self):
    ''' Run a filtered query that requires the "start" field to exist. '''
    start_exists = ExistsFilter("start")
    filtered_query = ElasticQuery.filtered(Query.term("seqid", 1), start_exists)
    total = Search(filtered_query, idx=ElasticSettings.idx('DEFAULT')).search().hits_total
    self.assertTrue(total >= 1, "Elastic filtered query retrieved marker(s)")
def get_disease(cls, request, disease, context):
    ''' Populate ``context`` with disease feature data for the code(s) in ``disease``.

    ``disease`` is a (possibly comma-separated) string of disease codes. For
    each matched disease document this gathers its regions with candidate
    genes, cross-disease (pleiotropy) annotations from IC_STATS and study
    hits, and its studies with publication details. Returns the updated
    ``context`` on success; raises ``Http404`` (after flashing a message)
    when no disease is given or none is found.
    '''
    # Guard before any attribute access: the original called disease.lower()
    # first, which raised AttributeError instead of Http404 for None input.
    if disease is None:
        messages.error(request, 'No disease given.')
        raise Http404()
    disease = disease.lower()
    # split(',') already yields a list; the original wrapped it in another
    # list, sending a nested list to the terms query.
    query = ElasticQuery(Query.terms("code", disease.split(',')))
    elastic = Search(query, idx=ElasticSettings.idx('DISEASE', 'DISEASE'), size=5)
    res = elastic.search()
    if res.hits_total == 0:
        messages.error(request, 'Disease(s) '+disease+' not found.')
    elif res.hits_total < 9:
        disease_docs = res.docs
        names = ', '.join([getattr(doc, 'name') for doc in disease_docs])
        # IC_STATS mapping _meta carries the disease for each stats index
        meta_response = Search.elastic_request(ElasticSettings.url(),
                                               ElasticSettings.idx("IC_STATS") + '/_mapping',
                                               is_post=False)
        elastic_meta = json.loads(meta_response.content.decode("utf-8"))
        for dis in disease_docs:
            dis_code = getattr(dis, 'code').upper()
            docs = DiseaseLocusDocument.get_disease_loci_docs(dis_code)
            regions = []
            ens_all_cand_genes = []
            all_markers = []
            for r in docs:
                region = r.get_disease_region()
                if region is not None:
                    regions.append(region)
                    ens_all_cand_genes.extend(region['ens_cand_genes'])
                    all_markers.extend(region['markers'])

            # get ensembl to gene symbol mapping for all candidate genes
            all_cand_genes = gene.utils.get_gene_docs_by_ensembl_id(ens_all_cand_genes)
            for region in regions:
                region['cand_genes'] = {cg: all_cand_genes[cg]
                                        for cg in region.pop("ens_cand_genes", None)}
            setattr(dis, 'regions', regions)

            # look for pleiotropy by looking for diseases for the markers in
            # IC_STATS and other study hits
            stats_query = ElasticQuery.filtered(Query.terms("marker", all_markers),
                                                Filter(RangeQuery("p_value", lte=5E-08)),
                                                sources=['marker'])
            stats_docs = Search(stats_query, idx=ElasticSettings.idx("IC_STATS"),
                                size=len(all_markers)).search().docs
            other_hits_query = ElasticQuery(
                BoolQuery(must_arr=[RangeQuery("tier", lte=2), Query.terms("marker", all_markers)]),
                sources=['marker', 'disease'])
            other_hits = Search(other_hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'),
                                size=5000).search().docs

            for region in regions:
                diseases = [dis_code]
                for doc in stats_docs:
                    if getattr(doc, 'marker') in region['markers']:
                        meta_info = elastic_meta[doc.index()]['mappings'][doc.type()]['_meta']
                        if meta_info['disease'] not in diseases:
                            diseases.append(meta_info['disease'])
                for doc in other_hits:
                    if getattr(doc, 'marker') in region['markers']:
                        if doc.disease is not None and doc.disease not in diseases:
                            diseases.append(doc.disease)
                region['diseases'] = diseases

            # attach studies with publication details (date, journal, first author surname)
            studies = StudyDocument.get_studies(disease_code=dis_code)
            for doc in studies:
                setattr(doc, 'study_id', getattr(doc, 'study_id').replace('GDXHsS00', ''))
                pmid = getattr(doc, 'principal_paper')
                pubs = PublicationDocument.get_publications(pmid,
                                                            sources=['date', 'authors.name', 'journal'])
                if len(pubs) > 0:
                    authors = getattr(pubs[0], 'authors')
                    setattr(doc, 'date', getattr(pubs[0], 'date'))
                    setattr(doc, 'journal', getattr(pubs[0], 'journal'))
                    setattr(doc, 'author',
                            authors[0]['name'].rsplit(None, 1)[-1] if authors else "")
            setattr(dis, 'studies', studies)
        context['features'] = disease_docs
        context['title'] = names
        return context
    # hits_total >= 9 (too many matches) falls through to 404, as does the
    # not-found branch above
    raise Http404()
else: features[qid] = src['score'] update_count += len(hits) print(str(update_count)+" of "+str(resp_json['hits']['total'])) scan_n_sroll = ScanAndScroll.scan_and_scroll(criteria_idx, call_fun=process_hits, idx_type=criteria_idx_type) # Update suggest weights update_count = 0 chunk_size = 2000 feature_names = list(features.keys()) for i in range(0, len(feature_names), chunk_size): feature_names_slice = feature_names[i:i+chunk_size] terms_filter = TermsFilter.get_terms_filter(feature_id, feature_names_slice) query = ElasticQuery.filtered(Query.match_all(), terms_filter, sources=[feature_id, 'suggest']) docs = Search(query, idx=idx, idx_type=idx_type, size=chunk_size).search().docs json_data = '' for doc in docs: doc_id = doc.doc_id() if feature_id.startswith('dbxrefs'): obj_id = getattr(doc, 'dbxrefs')['ensembl'] else: obj_id = getattr(doc, feature_id) doc_data = {"update": {"_id": doc_id, "_type": idx_type, "_index": idx, "_retry_on_conflict": 3}} json_data += json.dumps(doc_data) + '\n' suggest = getattr(doc, 'suggest') if suggest is None:
def chicpeaSearch(request, url):
    ''' CHiCP search view: resolve a gene / region / SNP search term into a
    Hi-C interaction dataset and return a JSON payload for the browser.

    The search term is classified as a region (``chr:start-end``), an rs-id
    SNP, or (by default) a gene name; each branch builds an elastic
    bool/filter query against the target index and falls through to a common
    section that collects genes, SNPs and restriction fragments for the
    resolved segment. Errors are returned as ``{'error': ...}`` JSON.
    '''
    queryDict = request.GET
    user = request.user
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    hic = []
    addList = []
    searchType = 'gene'  # default classification; may become 'region' or 'snp' below
    searchTerm = queryDict.get("searchTerm").upper()
    searchTerm = searchTerm.replace(",", "")
    searchTerm = searchTerm.replace("..", "-")
    snpTrack = queryDict.get("snp_track")

    # indices/types this user is allowed to see
    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(
        user=user, idx_keys=None, idx_type_keys=None)

    if snpTrack:
        # drop the requested SNP track if the user lacks permission for it
        mo = re.match(r"(.*)-(.*)", snpTrack)
        (group, track) = mo.group(1, 2)  # @UnusedVariable
        if group != 'ud' and 'CP_STATS_'+group.upper()+'.'+snpTrack.upper() not in idx_type_keys_auth:
            snpTrack = None

    if targetIdx not in utils.tissues:
        # lazily populate the per-target tissue list cache from the index mapping
        for target in getattr(chicp_settings, 'CP_TARGET'):
            if 'CP_TARGET_'+target not in idx_keys_auth:
                if targetIdx == target:
                    retJSON = {'error': 'Sorry, you do not have permission to view this dataset.'}
                    return JsonResponse(retJSON)
                continue
            elasticJSON = Search(idx=ElasticSettings.idx('CP_TARGET_'+target)).get_mapping(
                mapping_type="gene_target")
            tissueList = list(elasticJSON[ElasticSettings.idx('CP_TARGET_'+target)]
                              ['mappings']['gene_target']['_meta']['tissue_type'].keys())
            utils.tissues['CP_TARGET_'+target] = tissueList

    if queryDict.get("region") or re.match(r"(.*):(\d+)-(\d+)", searchTerm):
        # explicit region parameter, or the search term itself looks like chr:start-end
        searchType = 'region'
        region = searchTerm
        if queryDict.get("region"):
            region = queryDict.get("region")
        else:
            searchTerm = ""
        mo = re.match(r"(.*):(\d+)-(\d+)", region)
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "")
        chrom = chrom.replace('CHR', "")
    if re.search("^rs[0-9]+", searchTerm.lower()):
        # rs-id: resolve the SNP to a genomic position first
        searchTerm = searchTerm.lower()
        addList.append(_find_snp_position(snpTrack, searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        if searchType != 'region':
            searchType = 'snp'

    logger.warn("### "+searchType+" - "+searchTerm+' ###')

    if searchType == 'region':
        query_bool = BoolQuery()
        filter_bool = BoolQuery()
        # include a free-text clause only when a bare term was given alongside a region param
        if searchTerm and len(addList) == 0 and re.match(r"(.*):(\d+)-(\d+)",
                                                         queryDict.get("searchTerm").replace(",", "")) == None:
            query_bool.must([Query.query_string(searchTerm, fields=["name", "ensg"]),
                             Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        else:
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if len(addList) > 0:
            # SNP position known: match interactions whose bait OR other-end spans it
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])
        else:
            # plain region: bait OR other-end must fall inside [segmin, segmax]
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])

        (hic, v1, v2) = _build_hic_query(query, targetIdx, segmin, segmax)  # @UnusedVariable

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': queryDict.get("searchTerm")+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        if len(addList) > 0:
            chrom = addList[0]['chr']
            query_bool = BoolQuery()
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
            query_bool = _add_tissue_filter(query_bool, targetIdx)
            filter_bool = BoolQuery()
            # interaction must span the SNP position on either the bait or other-end
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])
            query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                               sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
            # segment bounds are derived from the hits here, not supplied
            hic, segmin, segmax = _build_hic_query(query, targetIdx)

            if "error" in hic:
                return JsonResponse(hic)
            if len(hic) == 0:
                retJSON = {'error': 'Marker '+searchTerm+' does not overlap any bait/target regions in this dataset.'}
                return JsonResponse(retJSON)
    else:
        # gene search
        # geneQuery = ElasticQuery.query_string(searchTerm, fields=["gene_name"])
        geneQuery = ElasticQuery.filtered(Query.match_all(),
                                          Filter(Query.match("gene_name", searchTerm).query_wrap()))
        resultObj = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/',
                           search_query=geneQuery, size=0, qsort=Sort('seqid:asc,start')).search()
        if resultObj.hits_total > 1:
            # ambiguous gene name: keep only candidates that actually occur in the
            # target dataset (via the ensg aggregation), and ask the user to choose
            geneResults = []
            resultObj2 = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/',
                                search_query=geneQuery, size=(resultObj.hits_total+1),
                                qsort=Sort('seqid:asc,start')).search()
            docs = resultObj2.docs
            # gene_id values are stored quoted; strip the surrounding quotes
            gene_ids = [getattr(doc, 'attr')['gene_id'][1:-1] for doc in docs]

            query = ElasticQuery.filtered(Query.match_all(), TermsFilter.get_terms_filter('ensg', gene_ids))
            agg = Agg('ensg_agg', "terms", {"field": "ensg", "size": 0})
            res = Search(idx=ElasticSettings.idx('CP_TARGET_'+targetIdx), search_query=query,
                         aggs=Aggs(agg), size=0).search()
            ensg_count = res.aggs['ensg_agg'].get_buckets()
            gene_ids = [g['key'] for g in ensg_count]

            for d in resultObj2.docs:
                if getattr(d, "attr")["gene_id"].replace('\"', '') in gene_ids:
                    geneResults.append({
                        'gene_name': getattr(d, "attr")["gene_name"].replace('\"', ''),
                        'gene_id': getattr(d, "attr")["gene_id"].replace('\"', ''),
                        'location': "chr" + getattr(d, "seqid") + ":" +
                        locale.format_string("%d", getattr(d, "start"), grouping=True) + ".." +
                        locale.format_string("%d", getattr(d, "end"), grouping=True),
                    })

            if len(geneResults) == 0:
                retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
                return JsonResponse(retJSON)
            elif len(geneResults) > 1:
                retJSON = {
                    'error': 'Gene name <strong>'+searchTerm+'</strong> returns too many hits, please select your prefered result from the list below.',
                    'results': geneResults,
                    'cols': ['HGNC Symbol', 'Ensembl Gene ID', 'Location']
                }
                return JsonResponse(retJSON)

        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
        (hic, segmin, segmax) = _build_hic_query(query, targetIdx)
        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
            return JsonResponse(retJSON)
        chrom = hic[0]['baitChr']

    # NOTE(review): guards against chrom never being bound by any branch above
    try:
        chrom
    except NameError:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(snpTrack, chrom, segmin, segmax)
    frags = _build_frags_query(getattr(chicp_settings, 'DEFAULT_FRAG'), chrom, segmin, segmax)
    # rebase SNP positions relative to the segment start for display
    addList = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], addList)
    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": int(segmin),
                        "oend": int(segmax),
                        "rstart": 1,
                        "rend": int(segmax) - int(segmin),
                        "rchr": str(chrom),
                        "tissues": utils.tissues['CP_TARGET_'+targetIdx]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }
    response = JsonResponse(retJSON)
    return response