def test_missing_terms_filtered_query(self):
     ''' Test filtered query with a missing terms filter. '''
     terms_filter = TermsFilter.get_missing_terms_filter("field", "group_name")
     query = ElasticQuery.filtered(Query.match_all(), terms_filter)
     elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
     docs = elastic.search().docs
     self.assertTrue(len(docs) == 3, "Elastic string query retrieved all public docs")
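Note: get_missing_terms_filter wraps an Elasticsearch "missing" filter, so the query matches only documents that have no value for the named field (the public documents in these tests). A rough sketch of the request body this is expected to produce, based on the filter JSON quoted in the group-name test further down this page:

    {"query": {"filtered": {"query": {"match_all": {}},
                            "filter": {"missing": {"field": "group_name"}}}}}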
Example #2
    def gene_mgi_parse(cls, gene_pubs, idx):
        ''' Parse Ensembl and MGI data from JAX. '''
        orthogenes_mgi = {}
        for gene_mgi in gene_pubs:
            parts = gene_mgi.split('\t')
            if 'MGI:' not in parts[0]:
                raise PipelineError('MGI not found '+parts[0])
            if 'ENSMUSG' not in parts[5]:
                raise PipelineError('ENSMUSG not found '+parts[5])
            orthogenes_mgi[parts[5]] = parts[0].replace('MGI:', '')

        orthogene_keys = list(orthogenes_mgi.keys())
        chunk_size = 450
        for i in range(0, len(orthogene_keys), chunk_size):
            chunk_gene_keys = orthogene_keys[i:i+chunk_size]
            json_data = ''
            query = ElasticQuery.filtered(Query.match_all(),
                                          TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl",
                                                                       chunk_gene_keys))
            docs = Search(query, idx=idx, size=chunk_size).search().docs
            for doc in docs:
                ens_id = doc.doc_id()
                idx_type = doc.type()
                mm = getattr(doc, 'dbxrefs')['orthologs']['mmusculus']
                mm['MGI'] = orthogenes_mgi[mm['ensembl']]
                dbxrefs = {"dbxrefs": {'orthologs': {"mmusculus": mm}}}
                doc_data = {"update": {"_id": ens_id, "_type": idx_type,
                                       "_index": idx, "_retry_on_conflict": 3}}
                json_data += json.dumps(doc_data) + '\n'
                json_data += json.dumps({'doc': dbxrefs}) + '\n'

            if json_data != '':
                Loader().bulk_load(idx, idx_type, json_data)
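Note: json_data is newline-delimited JSON for the Elasticsearch bulk API; each matched document contributes an "update" action line followed by a partial-document line carrying the new MGI cross-reference. A minimal sketch of one such pair (ids and index name are illustrative, not taken from the data):

    {"update": {"_id": "ENSG00000123456", "_type": "gene", "_index": "genes_idx", "_retry_on_conflict": 3}}
    {"doc": {"dbxrefs": {"orthologs": {"mmusculus": {"ensembl": "ENSMUSG00000054321", "MGI": "97486"}}}}}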
Example #3
    def filter_queryset(self, request, queryset, view):
        ''' Override this method to request just the documents required from elastic. '''
        q_size = view.paginator.get_limit(request)
        q_from = view.paginator.get_offset(request)

        filterable = getattr(view, 'filter_fields', [])
        filters = dict([(k, v) for k, v in request.GET.items()
                        if k in filterable])
        search_filters = self._build_filters(filters=filters)
        if search_filters is not None:
            q = ElasticQuery.filtered(Query.match_all(), search_filters)
        else:
            q = ElasticQuery(Query.match_all())
        s = Search(search_query=q,
                   idx=getattr(view, 'idx'),
                   size=q_size,
                   search_from=q_from)
        json_results = s.get_json_response()
        results = []
        for result in json_results['hits']['hits']:
            new_obj = ElasticObject(initial=result['_source'])
            new_obj.uuid = result['_id']
            results.append(new_obj)
        view.es_count = json_results['hits']['total']
        return results
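Note: this filter backend reads a few attributes off the DRF view: an idx to search, an optional filter_fields whitelist for query parameters, and a limit/offset paginator. A minimal sketch of a compatible view, with the view and backend class names assumed rather than taken from the source:

    from rest_framework.generics import ListAPIView
    from rest_framework.pagination import LimitOffsetPagination

    class MarkerList(ListAPIView):                     # hypothetical view
        idx = ElasticSettings.idx('MARKER', 'MARKER')  # index the backend will query
        filter_fields = ('id', 'seqid', 'start')       # GET params accepted as filters
        filter_backends = (ElasticFilterBackend, )     # the backend defined above (name assumed)
        pagination_class = LimitOffsetPagination       # provides get_limit()/get_offset()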
Example #4
    def _check_gene_history(cls, gene_sets, config):
        '''find a way to handle this better'''

        section = config['GENE_HISTORY']
        newgene_ids = {}
        discountinued_geneids = []

        def process_hits(resp_json):
            hits = resp_json['hits']['hits']
            docs = [Document(hit) for hit in hits]
            for doc in docs:
                geneid = getattr(doc, 'geneid')
                discontinued_geneid = getattr(doc, 'discontinued_geneid')
                if geneid is None:
                    discountinued_geneids.append(str(discontinued_geneid))
                else:
                    newgene_ids[str(discontinued_geneid)] = str(geneid)

        query = ElasticQuery.filtered(Query.match_all(),
                                      TermsFilter.get_terms_filter("discontinued_geneid", gene_sets),
                                      sources=['geneid', 'discontinued_geneid'])
        ScanAndScroll.scan_and_scroll(section['index'], idx_type=section['index_type'],
                                      call_fun=process_hits, query=query)

        return (newgene_ids, discountinued_geneids)
Example #5
    def _ensembl_entrez_lookup(cls, ensembl_gene_sets, section):
        ''' Get an ensembl:entrez id dictionary. '''
        equery = ElasticQuery.filtered(Query.match_all(),
                                       TermsFilter.get_terms_filter("dbxrefs.ensembl", ensembl_gene_sets),
                                       sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])

        docs = Search(equery, idx=section['index'], size=len(ensembl_gene_sets)).search().docs
        return {doc.doc_id(): getattr(doc, 'dbxrefs')['entrez'] for doc in docs}
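Note: the comprehension keys on the document id (an Ensembl gene id) and maps it to the Entrez id held under dbxrefs, so the lookup has the shape (values illustrative):

    {'ENSG00000134242': '26191', 'ENSG00000275302': '6352'}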
Example #6
        def check_hits(resp_json):
            rsids = {}
            docs = [Document(hit) for hit in resp_json['hits']['hits']]
            for doc in docs:
                rsid = getattr(doc, "id")
                if rsid is not None:
                    rsids[rsid] = doc
            rsids_keys = list(rsids.keys())
            terms_filter = TermsFilter.get_terms_filter("id", rsids_keys)
            query = ElasticQuery.filtered(Query.match_all(), terms_filter)
            elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'), size=len(rsids_keys))
            docs_by_rsid = elastic.search().docs
            for doc in docs_by_rsid:
                info = getattr(doc, "info")
                if 'VC=SNV' not in info:
                    continue
                rsid = getattr(doc, "id")
                ic_doc = rsids[rsid]
                pos1 = getattr(doc, "start")
                pos2 = self._get_highest_build(ic_doc)['position']
                if abs(int(pos1) - int(pos2)) > 1:
                    is_par = getattr(ic_doc, 'is_par')
                    allele_a = getattr(ic_doc, 'allele_a')
                    if is_par is None and not (allele_a == 'D' or allele_a == 'I'):
                        msg = ("CHECK IC/DBSNP POSITIONS:: "+getattr(ic_doc, 'name') +
                               ' '+str(pos2)+" "+rsid+' '+str(pos1))
#                                ' ('+ic_doc.doc_id()+' '+json.dumps(getattr(ic_doc, 'build_info'))+')'

                        query = ElasticQuery.filtered(Query.term("seqid", getattr(doc, 'seqid')),
                                                      Filter(Query.term("start", pos2)))
                        elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
                        docs_by_pos = elastic.search().docs
                        if len(docs_by_pos) > 0:
                            for d in docs_by_pos:
                                msg += " ("+getattr(d, "id")+":"+str(getattr(d, "start"))+")"

                        query = ElasticQuery.filtered(Query.match_all(), Filter(Query.term("rslow", rsid)))
                        elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'HISTORY'))
                        docs_by_pos = elastic.search().docs
                        if len(docs_by_pos) > 0:
                            for d in docs_by_pos:
                                msg += " (rshigh:"+str(getattr(d, "rshigh")) + \
                                       " build_id:"+str(getattr(d, "build_id"))+")"

                        logger.error(msg)
Example #7
 def test_and_filtered_query(self):
     ''' Test building and running a filtered query. '''
     query_bool = BoolQuery(must_arr=[RangeQuery("start", gte=1)])
     and_filter = AndFilter(query_bool)
     and_filter.extend(RangeQuery("start", gte=1)) \
               .extend(Query.term("seqid", 1))
     query = ElasticQuery.filtered(Query.term("seqid", 1), and_filter)
     elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
     self.assertTrue(elastic.search().hits_total >= 1, "Elastic filtered query retrieved marker(s)")
Example #8
 def test_url_rotate(self):
     ''' Test the url rotates from http://xxx:9200 to correct url. '''
     query = ElasticQuery.filtered(Query.term("seqid", 1),
                                   Filter(Query.term("id", "rs768019142")))
     elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
     self.assertTrue(elastic.search().hits_total == 1,
                     "Elastic filtered query retrieved marker")
     Search.index_exists('test', 'test2')
     ElasticUrl.URL_INDEX = 0  # reset
Example #9
    def _entrez_ensembl_lookup(cls, gene_sets, section, config=None):
        ''' Get an entrez:ensembl id dictionary. '''
        (newgene_ids, discontinued_ids) = Gene._check_gene_history(gene_sets, config)
        replaced_gene_sets = Gene._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)
        equery = ElasticQuery.filtered(Query.match_all(),
                                       TermsFilter.get_terms_filter("dbxrefs.entrez", replaced_gene_sets),
                                       sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])

        docs = Search(equery, idx=section['index'], size=len(replaced_gene_sets)).search().docs
        return {getattr(doc, 'dbxrefs')['entrez']: doc.doc_id() for doc in docs}
Example #10
 def test_or_filtered_query(self):
     ''' Test building and running a filtered query. '''
     highlight = Highlight(["id", "seqid"])
     query_bool = BoolQuery(must_arr=[RangeQuery("start", lte=1),
                                      RangeQuery("end", gte=100000)])
     or_filter = OrFilter(RangeQuery("start", gte=1, lte=100000))
     or_filter.extend(query_bool) \
              .extend(Query.query_string("rs*", fields=["id", "seqid"]).query_wrap())
     query = ElasticQuery.filtered(Query.term("seqid", 1), or_filter, highlight=highlight)
     elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
     self.assertTrue(elastic.search().hits_total >= 1, "Elastic filtered query retrieved marker(s)")
Example #11
def _build_frags_query(frags_idx, chrom, segmin, segmax):

    query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                  Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                  utils.bedFields)
    fragsQuery = Search(search_query=query, search_from=0, size=2000000, idx=frags_idx)

    fragsResult = fragsQuery.get_result()
    frags = fragsResult['data']
    frags = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], frags)
    return frags
Example #12
File: views.py Project: D-I-L/pydgin
    def post(self, request, *args, **kwargs):
        ens_id = self.request.POST.get('ens_id')
        marker = self.request.POST.get('marker')
        markers = self.request.POST.getlist('markers[]')

        if ens_id:
            sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
        elif marker:
            sfilter = Filter(Query.query_string(marker, fields=["marker"]).query_wrap())
        elif markers:
            sfilter = Filter(Query.query_string(' '.join(markers), fields=["marker"]).query_wrap())

        query = ElasticQuery.filtered(Query.match_all(), sfilter)
        elastic = Search(query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'), size=500)
        study_hits = elastic.get_json_response()['hits']

        ens_ids = []
        pmids = []
        for hit in study_hits['hits']:
            if 'pmid' in hit['_source']:
                pmids.append(hit['_source']['pmid'])
            if 'genes' in hit['_source']:
                for ens_id in hit['_source']['genes']:
                    ens_ids.append(ens_id)
        docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
        pub_docs = PublicationDocument.get_pub_docs_by_pmid(pmids, sources=['authors.name', 'journal'])

        for hit in study_hits['hits']:
            genes = {}
            if 'genes' in hit['_source']:
                for ens_id in hit['_source']['genes']:
                    try:
                        genes[ens_id] = getattr(docs[ens_id], 'symbol')
                    except KeyError:
                        genes = {ens_id: ens_id}
            hit['_source']['genes'] = genes
            if 'pmid' in hit['_source']:
                pmid = hit['_source']['pmid']
                try:
                    authors = getattr(pub_docs[pmid], 'authors')
                    journal = getattr(pub_docs[pmid], 'journal')
                    hit['_source']['pmid'] = \
                        {'pmid': pmid,
                         'author': authors[0]['name'].rsplit(None, 1)[-1] if authors else "",
                         'journal': journal}
                except KeyError:
                    hit['_source']['pmid'] = {'pmid': pmid}

        return JsonResponse(study_hits)
Example #13
def _build_frags_query(frags_idx, chrom, segmin, segmax):

    query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                  Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                  utils.bedFields)
    fragsQuery = Search(search_query=query, search_from=0, size=10000, idx=frags_idx)

    # fragsResult = fragsQuery.get_result()
    # frags = fragsResult['data']
    fragsResult = fragsQuery.get_json_response()
    frags = []
    for hit in fragsResult['hits']['hits']:
        frags.append(hit['_source'])
    frags = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], frags)
    return frags
Example #14
File: views.py Project: D-I-L/django-chicp
def _build_frags_query(frags_idx, chrom, segmin, segmax):

    query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                  Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                  utils.bedFields)
    fragsQuery = Search(search_query=query, search_from=0, size=10000, idx=frags_idx)

    # fragsResult = fragsQuery.get_result()
    # frags = fragsResult['data']
    fragsResult = fragsQuery.get_json_response()
    frags = []
    for hit in fragsResult['hits']['hits']:
        frags.append(hit['_source'])
    frags = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], frags)
    return frags
Example #15
    def _entrez_ensembl_lookup(cls, gene_sets, section, config=None):
        ''' Get an entrez:ensembl id dictionary. '''
        (newgene_ids, discontinued_ids) = Gene._check_gene_history(gene_sets, config)
        replaced_gene_sets = Gene._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)
        lookup = {}

        def process_hits(resp_json):
            hits = resp_json['hits']['hits']
            docs = [Document(hit) for hit in hits]
            lookup.update({getattr(doc, 'dbxrefs')['entrez']: doc.doc_id() for doc in docs})

        equery = ElasticQuery.filtered(Query.match_all(),
                                       TermsFilter.get_terms_filter("dbxrefs.entrez", replaced_gene_sets),
                                       sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
        ScanAndScroll.scan_and_scroll(section['index'], call_fun=process_hits, query=equery)
        return lookup
Example #16
    def _check_gene_history(cls, gene_sets, section):
        query = ElasticQuery.filtered(Query.match_all(),
                                      TermsFilter.get_terms_filter("discontinued_geneid", gene_sets))
        docs = Search(query, idx=section['index'], idx_type=section['index_type_history'], size=1000000).search().docs

        newgene_ids = {}
        discountinued_geneids = []
        for doc in docs:
            geneid = getattr(doc, 'geneid')
            discontinued_geneid = getattr(doc, 'discontinued_geneid')

            if geneid is None:
                discountinued_geneids.append(str(discontinued_geneid))
            else:
                newgene_ids[str(discontinued_geneid)] = str(geneid)

        return (newgene_ids, discountinued_geneids)
Example #17
 def range_overlap_query(cls,
                         seqid,
                         start_range,
                         end_range,
                         field_list=None,
                         seqid_param="seqid",
                         start_param="start",
                         end_param="end"):
     ''' Constructs a range overlap query '''
     query_bool = BoolQuery(must_arr=[
         RangeQuery(start_param, lte=start_range),
         RangeQuery(end_param, gte=end_range)
     ])
     or_filter = OrFilter(
         RangeQuery(start_param, gte=start_range, lte=end_range))
     or_filter.extend(RangeQuery(end_param, gte=start_range, lte=end_range)) \
              .extend(query_bool)
     return ElasticQuery.filtered(Query.term(seqid_param, seqid), or_filter,
                                  field_list)
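Note: the OR filter covers the three ways a feature can overlap [start_range, end_range]: its start falls inside the range, its end falls inside the range, or it spans the whole range (start before and end after). A hedged usage sketch, assuming this classmethod is exposed on ElasticQuery:

    query = ElasticQuery.range_overlap_query(1, 10000, 20000, field_list=['id', 'start', 'end'])
    docs = Search(query, idx=ElasticSettings.idx('DEFAULT')).search().docs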
Example #18
    def _check_gene_history(cls, gene_sets, config):
        '''find a way to handle this better'''

        section = config['GENE_HISTORY']
        query = ElasticQuery.filtered(Query.match_all(),
                                      TermsFilter.get_terms_filter("discontinued_geneid", gene_sets),
                                      sources=['geneid', 'discontinued_geneid'])
        docs = Search(query, idx=section['index'], idx_type=section['index_type'],
                      size=len(gene_sets)).search().docs

        newgene_ids = {}
        discountinued_geneids = []
        for doc in docs:
            geneid = getattr(doc, 'geneid')
            discontinued_geneid = getattr(doc, 'discontinued_geneid')
            if geneid is None:
                discountinued_geneids.append(str(discontinued_geneid))
            else:
                newgene_ids[str(discontinued_geneid)] = str(geneid)
        return (newgene_ids, discountinued_geneids)
Example #19
    def _update_gene(cls, genes, idx):
        ''' Use genes data to update the index. '''
        gene_keys = list(genes.keys())
        chunk_size = 450
        for i in range(0, len(genes), chunk_size):
            chunk_gene_keys = gene_keys[i:i+chunk_size]
            json_data = ''

            query = ElasticQuery.filtered(Query.match_all(),
                                          TermsFilter.get_terms_filter("dbxrefs.entrez", chunk_gene_keys))
            docs = Search(query, idx=idx, size=chunk_size).search().docs
            for doc in docs:
                ens_id = doc._meta['_id']
                idx_type = doc.type()
                entrez = getattr(doc, 'dbxrefs')['entrez']
                doc_data = {"update": {"_id": ens_id, "_type": idx_type,
                                       "_index": idx, "_retry_on_conflict": 3}}
                json_data += json.dumps(doc_data) + '\n'
                json_data += json.dumps({'doc': genes[entrez]}) + '\n'
            if json_data != '':
                Loader().bulk_load(idx, idx_type, json_data)
Example #20
def studies_details(request):
    """ Get studies for a given ensembl ID. """
    ens_id = request.POST.get("ens_id")
    sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), sfilter)
    elastic = Search(query, idx=ElasticSettings.idx("REGION", "STUDY_HITS"), size=500)
    study_hits = elastic.get_json_response()["hits"]

    ens_ids = []
    pmids = []
    for hit in study_hits["hits"]:
        if "pmid" in hit["_source"]:
            pmids.append(hit["_source"]["pmid"])
        for ens_id in hit["_source"]["genes"]:
            ens_ids.append(ens_id)
    docs = _get_gene_docs_by_ensembl_id(ens_ids, ["symbol"])
    pub_docs = _get_pub_docs_by_pmid(pmids, sources=["authors.name", "journal"])

    for hit in study_hits["hits"]:
        genes = {}
        for ens_id in hit["_source"]["genes"]:
            try:
                genes[ens_id] = getattr(docs[ens_id], "symbol")
            except KeyError:
                genes = {ens_id: ens_id}
        hit["_source"]["genes"] = genes
        if "pmid" in hit["_source"]:
            pmid = hit["_source"]["pmid"]
            try:
                authors = getattr(pub_docs[pmid], "authors")
                journal = getattr(pub_docs[pmid], "journal")
                hit["_source"]["pmid"] = {
                    "pmid": pmid,
                    "author": authors[0]["name"].rsplit(None, 1)[-1],
                    "journal": journal,
                }
            except KeyError:
                hit["_source"]["pmid"] = {"pmid": pmid}

    return JsonResponse(study_hits)
Example #21
def genesets_details(request):
    """ Get pathway gene sets for a given ensembl ID. """
    ens_id = request.POST.get("ens_id")
    geneset_filter = Filter(Query.query_string(ens_id, fields=["gene_sets"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), geneset_filter)
    elastic = Search(query, idx=ElasticSettings.idx("GENE", "PATHWAY"), size=500)
    genesets_hits = elastic.get_json_response()["hits"]
    ens_ids = []
    for hit in genesets_hits["hits"]:
        for ens_id in hit["_source"]["gene_sets"]:
            ens_ids.append(ens_id)
    docs = _get_gene_docs_by_ensembl_id(ens_ids, ["symbol"])

    for hit in genesets_hits["hits"]:
        genesets = {}
        for ens_id in hit["_source"]["gene_sets"]:
            try:
                genesets[ens_id] = getattr(docs[ens_id], "symbol")
            except KeyError:
                genesets[ens_id] = ens_id
        hit["_source"]["gene_sets"] = genesets
    return JsonResponse(genesets_hits)
Example #22
    def get_new_pmids(cls, pmids, idx, disease_code=None):
        ''' Find PMIDs in a list that are not in the elastic index. '''
        chunk_size = 800
        pmids_found = set()
        pmids_found_add = pmids_found.add
        time.sleep(5)

        for i in range(0, len(pmids), chunk_size):
            pmids_slice = pmids[i:i+chunk_size]
            terms_filter = TermsFilter.get_terms_filter("pmid", pmids_slice)
            query = ElasticQuery.filtered(Query.match_all(), terms_filter, sources=['pmid', 'tags'])

            docs = Search(query, idx=idx, size=chunk_size).search().docs
            json_data = ''

            for doc in docs:
                pmids_found_add(getattr(doc, 'pmid'))
                if disease_code is not None:
                    tags = getattr(doc, 'tags')
                    if 'disease' in tags:
                        disease = tags['disease']
                    else:
                        disease = []
                    if disease_code not in disease:
                        # update disease attribute
                        disease.append(disease_code)
                        tags['disease'] = disease
                        idx_name = doc._meta['_index']
                        idx_type = doc.type()

                        doc_data = {"update": {"_id": doc._meta['_id'], "_type": idx_type,
                                               "_index": idx_name, "_retry_on_conflict": 3}}
                        json_data += json.dumps(doc_data) + '\n'
                        json_data += json.dumps({'doc': {'tags': tags}}) + '\n'

            if json_data != '':
                Loader().bulk_load(idx_name, idx_type, json_data)

        return [pmid for pmid in pmids if pmid not in pmids_found]
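Note: the method returns, in input order, the PMIDs that were not found in the index; PMIDs that are already indexed are re-tagged with disease_code (when given) via a bulk partial update of their tags.disease list. A hedged usage sketch, with the owning class name, index name and PMIDs assumed for illustration:

    new_pmids = Publication.get_new_pmids(['26367185', '24076602'], 'publications_idx', disease_code='T1D')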
Example #23
File: views.py Project: D-I-L/pydgin
def genesets_details(request):
    ''' Get pathway gene sets for a given ensembl ID. '''
    ens_id = request.POST.get('ens_id')
    geneset_filter = Filter(Query.query_string(ens_id, fields=["gene_sets"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), geneset_filter)
    elastic = Search(query, idx=ElasticSettings.idx('GENE', 'PATHWAY'), size=500)
    genesets_hits = elastic.get_json_response()['hits']
    ens_ids = []
    for hit in genesets_hits['hits']:
        for ens_id in hit['_source']['gene_sets']:
            ens_ids.append(ens_id)
    docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])

    for hit in genesets_hits['hits']:
        genesets = {}
        for ens_id in hit['_source']['gene_sets']:
            try:
                genesets[ens_id] = getattr(docs[ens_id], 'symbol')
            except KeyError:
                genesets[ens_id] = ens_id
        hit['_source']['gene_sets'] = genesets
    return JsonResponse(genesets_hits)
Example #24
    def _convert_entrezid2ensembl(cls, gene_sets, section, log_output_file_handler=None, log_conversion=True):
        '''Converts given set of entrez ids to ensembl ids by querying the gene index dbxrefs'''

        # first check in gene_history
        (newgene_ids, discontinued_ids) = cls._check_gene_history(gene_sets, section)

        # replace all old ids with new ids
        replaced_gene_sets = cls._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)

        query = ElasticQuery.filtered(Query.match_all(),
                                      TermsFilter.get_terms_filter("dbxrefs.entrez", replaced_gene_sets))
        docs = Search(query, idx=section['index'], size=1000000).search().docs
        ensembl_ids = []
        for doc in docs:
            ens_id = doc._meta['_id']
            ensembl_ids.append(ens_id)

        if log_conversion:
            if log_output_file_handler is not None:
                cls._log_entrezid2ensembl_coversion(replaced_gene_sets, ensembl_ids, log_output_file_handler)

        return ensembl_ids
Example #25
def _build_snp_query(snp_track, chrom, segmin, segmax):
    snps = []
    snpMeta = {}
    maxScore = -1
    if snp_track and snp_track != 'None':
        # get SNPs based on this segment
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        snp_track_idx = getattr(chicp_settings, 'CHICP_IDX').get(group).get('INDEX')
        snp_track_type = ''
        if getattr(chicp_settings, 'CHICP_IDX').get(group).get('TRACKS').get(snp_track):
            snp_track_type = getattr(chicp_settings, 'CHICP_IDX').get(group).get('TRACKS') \
                .get(snp_track).get('TYPE')
        else:
            snp_track_type = track

        query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                      Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                      utils.snpFields)
        snpQuery = Search(search_query=query, search_from=0, size=2000000, idx=snp_track_idx+'/'+snp_track_type)

        snpResult = snpQuery.get_result()
        snps = snpResult['data']
        snps = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], snps)

        data_type = getattr(chicp_settings, 'CHICP_IDX').get(group).get('DATA_TYPE')
        snpSettings = getattr(chicp_settings, 'STUDY_DEFAULTS').get(data_type)
#        if 'max' in snpSettings:
#            maxScore = float(snpSettings['max'])
#        else:
        for s in snps:
            if float(s['score']) > maxScore:
                maxScore = float(s['score'])
        snpSettings['max'] = maxScore

        snpMeta = snpSettings

    return snps, snpMeta
Example #26
        def check_hits(resp_json):
            self.assertTrue('hits' in resp_json, 'scan and scroll hits')
            self.assertGreaterEqual(len(resp_json['hits']['hits']), 1)
            docs = [Document(hit) for hit in resp_json['hits']['hits']]
            for doc1 in docs:
                doc_internal_id = getattr(doc1, "internal_id")
                if doc_internal_id in internal_id:
                    pos1 = self._get_highest_build(doc1)
                    for doc2 in internal_id[doc_internal_id]:
                        pos2 = self._get_highest_build(doc2)
                        if pos2['position'] != pos1['position']:
                            msg = ("DIFFERENT POSITIONS ID: "+str(doc_internal_id)+":\t" +
                                   str(getattr(doc1, "name"))+": "+pos1['position']+" ("+doc1.doc_id()+")\t" +
                                   str(getattr(doc2, "name"))+": "+pos2['position']+" ("+doc2.doc_id()+")\t")
                            try:
                                terms_filter = TermsFilter.get_terms_filter("start", [pos1['position'],
                                                                                      pos2['position']])
                                query = ElasticQuery.filtered(Query.term("seqid", pos1['seqid']), terms_filter)
                                elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
                                docs_by_pos = elastic.search().docs
                                found = False
                                for d in docs_by_pos:
                                    msg += getattr(d, "id")+": "+str(getattr(d, "start"))+"\t"
                                    if getattr(d, "id") == 'rs'+str(doc_internal_id):
                                        found = True

                                if not found:
                                    msg += 'rs'+str(doc_internal_id)
                                    if self._rs_exists('rs'+str(doc_internal_id)):
                                        msg += ' EXISTS IN DBSNP\t'
                                    else:
                                        msg += ' NOT IN DBSNP\t'
                                logger.error(msg)
                            except KeyError:
                                logger.error(msg)
                    internal_id[doc_internal_id].append(doc1)
                else:
                    internal_id[doc_internal_id] = [doc1]
Example #27
def _build_snp_query(snp_track, chrom, segmin, segmax):
    snps = []
    snpMeta = {}
    maxScore = -1
    if snp_track and snp_track != 'None':
        # get SNPs based on this segment
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        try:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper(), snp_track.upper())
        except SettingsError:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper())+"/"+track

        query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                      Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                      utils.snpFields)
        snpQuery = Search(search_query=query, search_from=0, size=10000, idx=snp_track_idx)

        # snpResult = snpQuery.get_result()
        # snps = snpResult['data']
        snpResult = snpQuery.get_json_response()
        snps = []
        for hit in snpResult['hits']['hits']:
            snps.append(hit['_source'])
        snps = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], snps)

        data_type = ElasticSettings.get_label('CP_STATS_'+group.upper(), None, "data_type")
        snpSettings = getattr(chicp_settings, 'STUDY_DEFAULTS').get(data_type)

        for s in snps:
            if float(s['score']) > maxScore:
                maxScore = float(s['score'])
        snpSettings['max'] = maxScore

        snpMeta = snpSettings

    return snps, snpMeta
Example #28
File: views.py Project: D-I-L/django-chicp
def _build_snp_query(snp_track, chrom, segmin, segmax):
    snps = []
    snpMeta = {}
    maxScore = -1
    if snp_track and snp_track != 'None':
        # get SNPs based on this segment
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        try:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper(), snp_track.upper())
        except SettingsError:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper())+"/"+track

        query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                      Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                      utils.snpFields)
        snpQuery = Search(search_query=query, search_from=0, size=10000, idx=snp_track_idx)

        # snpResult = snpQuery.get_result()
        # snps = snpResult['data']
        snpResult = snpQuery.get_json_response()
        snps = []
        for hit in snpResult['hits']['hits']:
            snps.append(hit['_source'])
        snps = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], snps)

        data_type = ElasticSettings.get_label('CP_STATS_'+group.upper(), None, "data_type")
        snpSettings = getattr(chicp_settings, 'STUDY_DEFAULTS').get(data_type)

        for s in snps:
            if float(s['score']) > maxScore:
                maxScore = float(s['score'])
        snpSettings['max'] = maxScore

        snpMeta = snpSettings

    return snps, snpMeta
Example #29
    def filter_queryset(self, request, queryset, view):
        ''' Get disease regions. '''
        try:
            filterable = getattr(view, 'filter_fields', [])
            filters = dict([(k, v) for k, v in request.GET.items() if k in filterable])
            dis = filters.get('disease', 'T1D')
            show_genes = filters.get('genes', False)
            show_markers = filters.get('markers', False)
            show_regions = filters.get('regions', True)

            build = self._get_build(filters.get('build', settings.DEFAULT_BUILD))
            docs = DiseaseLocusDocument.get_disease_loci_docs(dis)
            if len(docs) == 0:
                messages.error(request, 'No regions found for '+dis+'.')

            visible_hits = DiseaseLocusDocument.get_hits([h for r in docs for h in getattr(r, 'hits')])
            regions = []
            all_markers = []
            all_genes = []
            ens_all_cand_genes = []
            for r in docs:
                region = r.get_disease_region(visible_hits, build=build)
                if region is not None:
                    ens_all_cand_genes.extend(region['ens_cand_genes'])
                    all_markers.extend(region['markers'])
                    region['hits'] = [self._study_hit_obj(s, region) for s in
                                      StudyHitDocument.process_hits(r.hit_docs, region['all_diseases'])]

                    (all_coding, all_non_coding) = views.get_genes_for_region(getattr(r, "seqid"),
                                                                              region['rstart']-500000,
                                                                              region['rstop']+500000)
                    (region_coding, coding_up, coding_down) = views._region_up_down(all_coding, region['rstart'],
                                                                                    region['rstop'])
                    (region_non_coding, non_coding_up, non_coding_down) = \
                        views._region_up_down(all_non_coding, region['rstart'], region['rstop'])
                    region['genes'] = {
                        'upstream': {'coding': [g.doc_id() for g in coding_up],
                                     'non_coding': [g.doc_id() for g in non_coding_up]},
                        'region': {'coding': [g.doc_id() for g in region_coding],
                                   'non_coding': [g.doc_id() for g in region_non_coding]},
                        'downstream': {'coding': [g.doc_id() for g in coding_down],
                                       'non_coding': [g.doc_id() for g in non_coding_down]},
                    }
                    all_genes.extend(region['genes']['region']['coding'])
                    all_genes.extend(region['genes']['region']['non_coding'])
                    regions.append(region)

            # look for pleiotropy by looking for diseases for the markers in IC_STATS and other study hits
            stats_query = ElasticQuery.filtered(Query.terms("marker", all_markers),
                                                Filter(RangeQuery("p_value", lte=5E-08)))
            stats_docs = Search(stats_query, idx=ElasticSettings.idx("IC_STATS"), size=len(all_markers)).search().docs
            meta_response = Search.elastic_request(ElasticSettings.url(), ElasticSettings.idx("IC_STATS") + '/_mapping',
                                                   is_post=False)
            # get ensembl to gene symbol mapping for all candidate genes
            extra_markers = []
            for region in regions:
                # add diseases from IC/GWAS stats
                (study_ids, region['marker_stats']) = views._process_stats(stats_docs, region['markers'], meta_response)
                region['all_diseases'].extend([getattr(mstat, 'disease') for mstat in region['marker_stats']])

                other_hits_query = ElasticQuery(
                        BoolQuery(must_arr=[RangeQuery("tier", lte=2), Query.terms("marker", region['markers'])],
                                  must_not_arr=[Query.terms("dil_study_id", study_ids)]))
                other_hits = Search(other_hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'),
                                    size=100).search()
                region['extra_markers'] = [self._study_hit_obj(s, region) for s in
                                           StudyHitDocument.process_hits(other_hits.docs, region['all_diseases'])]
                region['all_diseases'] = list(set(region['all_diseases']))
                extra_markers.extend([m['marker_id'] for m in region['extra_markers']])

            # get markers
            marker_objs = []
            if show_markers:
                query = ElasticQuery(Query.terms("id", all_markers), sources=['id', 'start'])
                marker_docs = Search(search_query=query, idx=ElasticSettings.idx('MARKER', 'MARKER'),
                                     size=len(all_markers)).search().docs
                mids = {getattr(m, 'id'): getattr(m, 'start') for m in marker_docs}
                marker_objs = [h for r in regions for h in r['hits']]
                marker_objs.extend([h for r in regions for h in r['extra_markers']])
                for m in marker_objs:
                    m['start'] = mids[m['marker_id']]

            # get genes
            gene_objs = []
            if show_genes:
                all_genes.extend(ens_all_cand_genes)
                gene_docs = GeneDocument.get_genes(all_genes, sources=['start', 'stop', 'chromosome',
                                                                       'symbol', 'biotype'])
                for doc in Document.sorted_alphanum(gene_docs, 'chromosome'):
                    ensembl_id = doc.doc_id()
                    region_name = ''
                    candidate_gene = 0
                    for region in regions:
                        if ('genes' in region and
                            (ensembl_id in region['genes']['region']['coding'] or
                             ensembl_id in region['genes']['region']['non_coding'] or
                             ensembl_id in region['ens_cand_genes'])):
                            region_name = region['region_name']
                            candidate_gene = 1 if ensembl_id in region['ens_cand_genes'] else 0
                            break
                    gene_objs.append({
                        'ensembl_id': ensembl_id,
                        'seqid': 'chr'+getattr(doc, 'chromosome'),
                        'start': getattr(doc, 'start'),
                        'end': getattr(doc, 'stop'),
                        'symbol': getattr(doc, 'symbol'),
                        'biotype': getattr(doc, 'biotype'),
                        'region_name': region_name,
                        'candidate_gene': candidate_gene
                    })
            if show_regions == 'false':
                regions = []
            regions.extend(gene_objs)
            regions.extend(marker_objs)
            return regions
        except (TypeError, ValueError, IndexError, ConnectionError) as e:
            print(e)
            raise Http404
Example #30
    def test_elastic_group_name(self):
        '''
        Testing the workflow defined in: https://killin.cimr.cam.ac.uk/nextgensite/2015/08/05/region-authorization/
        Testing various elastic queries

        idx doc:
         "_source":{"attr": {"region_id": "803", "group_name": "[\"DIL\"]", "Name": "4q27"},
         "seqid": "chr4", "source": "immunobase", "type": "region",
         "score": ".", "strand": ".", "phase": ".", "start": 122061159, "end": 122684373}
        idx_query:
        Private(in given group) OR Public
        -d '{"query":{"filtered":{"filter":{"bool": {
                                            "should": [
                                                        {"terms": {"group_name":["dil"]}},
                                                        { "missing": { "field": "group_name"   }}
                                                      ]
                                                    }}}}}'
        Private(in given group):
        -d '{"query":{"filtered":{"filter":{"terms":{"group_name":["dil"]}}}}}'
        Public:
        -d {'query': {'filtered': {'filter': {'missing': {'field': 'group_name'}},
                                  'query': {'term': {'match_all': '{}'}}}}}
        '''
        # get the groups for the given user
        response = self.client.post('/accounts/login/', {'username': '******', 'password': '******'})
        self.assertTrue(response.status_code, "200")

        logged_in_user = User.objects.get(id=self.client.session['_auth_user_id'])
        if logged_in_user and logged_in_user.is_authenticated():
            user_groups = get_user_groups(logged_in_user)
            self.assertTrue('READ' in user_groups, "user present in READ group")
            # make sure the user is not yet in DIL group
            self.assertFalse('DIL' in user_groups, "user not present in DIL group")

        group_names = get_user_groups(logged_in_user)
        if 'READ' in group_names : group_names.remove('READ')  # @IgnorePep8
        group_names = [x.lower() for x in group_names]
        self.assertTrue(len(group_names) == 0, "No group present")

        # Match all query, as there is no group we do a match all
        query = ElasticQuery(Query.match_all())
        expected_query_string = {"query": {"match_all": {}}}
        self.assertJSONEqual(json.dumps(query.query), json.dumps(expected_query_string), "Query string matched")

        Search.index_refresh(self.index_name)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(len(docs) == 12, "Elastic string query retrieved all public regions")

        # Filtered query for group names, add the user to DIL group and get the query string
        self.dil_group = Group.objects.create(name='DIL')
        logged_in_user.groups.add(self.dil_group)
        group_names = get_user_groups(logged_in_user)
        if 'READ' in group_names : group_names.remove('READ')  # @IgnorePep8
        group_names = [x.lower() for x in group_names]
        self.assertTrue(len(group_names) > 0, "More than 1 group present")
        self.assertTrue("dil" in group_names, "DIL group present")

        # retrieves all docs with missing field group_name - 11 docs
        terms_filter = TermsFilter.get_missing_terms_filter("field", "attr.group_name")
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(len(docs) == 11, "Elastic string query retrieved all public regions")

        # build filtered boolean query to bring all public docs + private docs 11+1 = 12 docs
        query_bool = BoolQuery()
        query_bool.should(Query.missing_terms("field", "group_name")) \
                  .should(Query.terms("group_name", group_names).query_wrap())

        query = ElasticQuery.filtered_bool(Query.match_all(), query_bool)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(len(docs) == 12, "Elastic string query retrieved both public + private regions")

        terms_filter = TermsFilter.get_terms_filter("attr.group_name", group_names)
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(len(docs) == 1, "Elastic string query retrieved one private regions")
        self.assertEqual(docs[0].attr['Name'], "4q27", "type matched region")
        self.assertEqual(docs[0].attr['region_id'], "803", "type matched region")
        self.assertEqual(docs[0].attr['group_name'], "[\"DIL\"]", "type matched region")
Example #31
 def test_not_filtered_query(self):
     ''' Test building and running a filtered query. '''
     not_filter = NotFilter(RangeQuery("start", lte=10000))
     query = ElasticQuery.filtered(Query.term("seqid", 1), not_filter)
     elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
     self.assertTrue(elastic.search().hits_total >= 1, "Elastic filtered query retrieved marker(s)")
Example #32
File: views.py Project: D-I-L/pydgin
    def get_regions(cls, request, dis, context):
        # is_authenticated = False
        elastic_url = ElasticSettings.url()

        (core, other) = Disease.get_site_diseases(dis_list=dis.upper().split(','))
        if len(core) == 0 and len(other) == 0:
            messages.error(request, 'Disease '+dis+' not found.')
            raise Http404()

        disease = core[0] if len(core) > 0 else other[0]
        context['title'] = getattr(disease, "name")+" Regions"

        docs = DiseaseLocusDocument.get_disease_loci_docs(dis)
        if len(docs) == 0:
            messages.error(request, 'No regions found for '+dis+'.')
            raise Http404()

        visible_hits = DiseaseLocusDocument.get_hits([h for r in docs for h in getattr(r, 'hits')])
        meta_response = Search.elastic_request(elastic_url, ElasticSettings.idx("IC_STATS") + '/_mapping',
                                               is_post=False)
        regions = []
        ens_all_cand_genes = []
        all_markers = []
        for r in docs:
            region = r.get_disease_region(visible_hits)
            if region is not None:
                ens_all_cand_genes.extend(region['ens_cand_genes'])
                all_markers.extend(region['markers'])
                region['hits'] = StudyHitDocument.process_hits(r.hit_docs, region['all_diseases'])

                (all_coding, all_non_coding) = get_genes_for_region(getattr(r, "seqid"),
                                                                    region['rstart']-500000, region['rstop']+500000)
                (region_coding, coding_up, coding_down) = _region_up_down(all_coding, region['rstart'], region['rstop'])
                (region_non_coding, non_coding_up, non_coding_down) = \
                    _region_up_down(all_non_coding, region['rstart'], region['rstop'])
                region['genes'] = {
                    'upstream': {'coding': coding_up, 'non_coding': non_coding_up},
                    'region': {'coding': region_coding, 'non_coding': region_non_coding},
                    'downstream': {'coding': coding_down, 'non_coding': non_coding_down},
                }
                regions.append(region)

        # look for pleiotropy by looking for diseases for the markers in IC_STATS and other study hits
        stats_query = ElasticQuery.filtered(Query.terms("marker", all_markers),
                                            Filter(RangeQuery("p_value", lte=5E-08)))
        stats_docs = Search(stats_query, idx=ElasticSettings.idx("IC_STATS"), size=len(all_markers)).search().docs

        # get ensembl to gene symbol mapping for all candidate genes
        all_cand_genes = gene.utils.get_gene_docs_by_ensembl_id(ens_all_cand_genes)
        for region in regions:
            region['cand_genes'] = {cg: all_cand_genes[cg] for cg in region.pop("ens_cand_genes", None)}
            (study_ids, region['marker_stats']) = _process_stats(stats_docs, region['markers'], meta_response)

            # add diseases from IC/GWAS stats
            region['all_diseases'].extend([getattr(mstat, 'disease') for mstat in region['marker_stats']])

            other_hits_query = ElasticQuery(
                        BoolQuery(must_arr=[RangeQuery("tier", lte=2), Query.terms("marker", region['markers'])],
                                  must_not_arr=[Query.terms("dil_study_id", study_ids)]))
            other_hits = Search(other_hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'), size=100).search()
            region['extra_markers'] = StudyHitDocument.process_hits(other_hits.docs, region['all_diseases'])

        context['regions'] = regions
        context['disease_code'] = [dis]
        context['disease'] = getattr(disease, "name")
        return context
Example #33
 def test_term_filtered_query(self):
     ''' Test filtered query with a term filter. '''
     query = ElasticQuery.filtered(Query.term("seqid", 1), Filter(Query.term("id", "rs768019142")))
     elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
     self.assertTrue(elastic.search().hits_total == 1, "Elastic filtered query retrieved marker")
Example #34
 def test_terms_filtered_query(self):
     ''' Test filtered query with a terms filter. '''
     terms_filter = TermsFilter.get_terms_filter("id", ["rs2476601", "rs768019142"])
     query = ElasticQuery.filtered(Query.term("seqid", 1), terms_filter)
     elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
     self.assertTrue(elastic.search().hits_total >= 1, "Elastic filtered query retrieved marker(s)")
Example #35
 def test_type_filtered_query(self):
     ''' Test filtered query with a type filter. '''
     type_filter = Filter(Query.query_type_for_filter("marker"))
     query = ElasticQuery.filtered(Query.term("seqid", 1), type_filter)
     elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
     self.assertTrue(elastic.search().hits_total >= 1, "Elastic filtered query retrieved marker(s)")
Example #36
File: views.py Project: D-I-L/django-chicp
def chicpeaSearch(request, url):
    queryDict = request.GET
    user = request.user
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    hic = []
    addList = []
    searchType = 'gene'
    searchTerm = queryDict.get("searchTerm").upper()
    searchTerm = searchTerm.replace(",", "")
    searchTerm = searchTerm.replace("..", "-")
    searchTerm = searchTerm.replace(" ", "") # Chris suggestion to prevent issue with spaces in queries
    snpTrack = queryDict.get("snp_track")

    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(
                                            user=user, idx_keys=None, idx_type_keys=None)

    if snpTrack:
        mo = re.match(r"(.*)-(.*)", snpTrack)
        (group, track) = mo.group(1, 2)  # @UnusedVariable
        if group != 'ud' and 'CP_STATS_'+group.upper()+'.'+snpTrack.upper() not in idx_type_keys_auth:
            snpTrack = None

    if targetIdx not in utils.tissues:
        for target in getattr(chicp_settings, 'CP_TARGET'):
            if 'CP_TARGET_'+target not in idx_keys_auth:
                if targetIdx == target:
                    retJSON = {'error': 'Sorry, you do not have permission to view this dataset.'}
                    return JsonResponse(retJSON)
                continue
            elasticJSON = Search(idx=ElasticSettings.idx('CP_TARGET_'+target)).get_mapping(mapping_type="gene_target")
            tissueList = list(elasticJSON[ElasticSettings.idx('CP_TARGET_'+target)]
                              ['mappings']['gene_target']['_meta']['tissue_type'].keys())
            utils.tissues['CP_TARGET_'+target] = tissueList

    if queryDict.get("region") or re.match(r"(.*):(\d+)-(\d+)", searchTerm):
        searchType = 'region'
        region = searchTerm
        if queryDict.get("region"):
            region = queryDict.get("region")
        else:
            searchTerm = ""
        mo = re.match(r"(.*):(\d+)-(\d+)", region)
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "")
        chrom = chrom.replace('CHR', "")
    if re.search("^rs[0-9]+", searchTerm.lower()):
        searchTerm = searchTerm.lower()
        addList.append(_find_snp_position(snpTrack, searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        if searchType != 'region':
            searchType = 'snp'

    logger.warn("### "+searchType+" - "+searchTerm+' ###')

    if searchType == 'region':
        query_bool = BoolQuery()
        filter_bool = BoolQuery()
        if searchTerm and len(addList) == 0 and re.match(r"(.*):(\d+)-(\d+)",
                                                         queryDict.get("searchTerm").replace(",", "")) is None:
            query_bool.must([Query.query_string(searchTerm, fields=["name", "ensg"]),
                             Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        else:
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])

        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if len(addList) > 0:
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])
        else:
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
        (hic, v1, v2) = _build_hic_query(query, targetIdx, segmin, segmax)  # @UnusedVariable

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': queryDict.get("searchTerm")+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        if len(addList) > 0:
            chrom = addList[0]['chr']

            query_bool = BoolQuery()
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
            query_bool = _add_tissue_filter(query_bool, targetIdx)

            filter_bool = BoolQuery()
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])

            query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                               sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
            hic, segmin, segmax = _build_hic_query(query, targetIdx)

            if "error" in hic:
                return JsonResponse(hic)
            if len(hic) == 0:
                retJSON = {'error': 'Marker '+searchTerm+' does not overlap any bait/target regions in this dataset.'}
                return JsonResponse(retJSON)
    else:
        # geneQuery = ElasticQuery.query_string(searchTerm, fields=["gene_name"])
        geneQuery = ElasticQuery.filtered(Query.match_all(), Filter(Query.match("gene_name", searchTerm).query_wrap()))
        resultObj = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/',
                           search_query=geneQuery, size=0, qsort=Sort('seqid:asc,start')).search()
        if resultObj.hits_total > 1:
            geneResults = []
            resultObj2 = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/', search_query=geneQuery,
                                size=(resultObj.hits_total+1), qsort=Sort('seqid:asc,start')).search()

            docs = resultObj2.docs
            gene_ids = [getattr(doc, 'attr')['gene_id'][1:-1] for doc in docs]

            query = ElasticQuery.filtered(Query.match_all(), TermsFilter.get_terms_filter('ensg', gene_ids))
            agg = Agg('ensg_agg', "terms", {"field": "ensg", "size": 0})
            res = Search(idx=ElasticSettings.idx('CP_TARGET_'+targetIdx), search_query=query, aggs=Aggs(agg),
                         size=0).search()

            ensg_count = res.aggs['ensg_agg'].get_buckets()
            gene_ids = [g['key'] for g in ensg_count]

            for d in resultObj2.docs:
                if getattr(d, "attr")["gene_id"].replace('\"', '') in gene_ids:
                    geneResults.append({
                        'gene_name': getattr(d, "attr")["gene_name"].replace('\"', ''),
                        'gene_id': getattr(d, "attr")["gene_id"].replace('\"', ''),
                        'location': "chr" + getattr(d, "seqid") + ":" +
                        locale.format_string("%d", getattr(d, "start"), grouping=True) + ".." +
                        locale.format_string("%d", getattr(d, "end"), grouping=True),
                    })

            if len(geneResults) == 0:
                retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
                return JsonResponse(retJSON)
            elif len(geneResults) > 1:
                retJSON = {
                    'error': 'Gene name <strong>'+searchTerm+'</strong> returns too many hits, please select your preferred result from the list below.',
                    'results': geneResults,
                    'cols': ['HGNC Symbol', 'Ensembl Gene ID', 'Location']
                }
                return JsonResponse(retJSON)

        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool, sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])

        (hic, segmin, segmax) = _build_hic_query(query, targetIdx)

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
            return JsonResponse(retJSON)
        chrom = hic[0]['baitChr']

    try:
        chrom
    except NameError:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(snpTrack, chrom, segmin, segmax)
    frags = _build_frags_query(getattr(chicp_settings, 'DEFAULT_FRAG'), chrom, segmin, segmax)

    addList = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], addList)

    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": int(segmin),
                        "oend": int(segmax),
                        "rstart": 1,
                        "rend": int(segmax) - int(segmin),
                        "rchr": str(chrom),
                        "tissues": utils.tissues['CP_TARGET_'+targetIdx]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }

    response = JsonResponse(retJSON)
    return response
    def test_gene_pipeline(self):
        """ Test gene pipeline. """

        INI_CONFIG = IniParser().read_ini(MY_INI_FILE)
        idx = INI_CONFIG["ENSEMBL_GENE_GTF"]["index"]
        idx_type = INI_CONFIG["ENSEMBL_GENE_GTF"]["index_type"]

        """ 1. Test ensembl GTF loading. """
        call_command(
            "pipeline", "--steps", "stage", "load", sections="ENSEMBL_GENE_GTF", dir=TEST_DATA_DIR, ini=MY_INI_FILE
        )
        Search.index_refresh(idx)

        elastic = Search(idx=idx, idx_type=idx_type)
        self.assertGreaterEqual(elastic.get_count()["count"], 1, "Count documents in the index")
        map1_props = Gene.gene_mapping(idx, idx_type, test_mode=True).mapping_properties
        map2_props = elastic.get_mapping()
        if idx not in map2_props:
            logger.error("MAPPING ERROR: " + json.dumps(map2_props))
        self._cmpMappings(map2_props[idx]["mappings"], map1_props, idx_type)

        """ 2. Test adding entrez ID to documents """
        call_command("pipeline", "--steps", "load", sections="GENE2ENSEMBL", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
        Search.index_refresh(idx)
        query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
        elastic = Search(query, idx=idx)
        docs = elastic.search().docs
        self.assertEqual(len(docs), 1)
        self.assertTrue("entrez" in getattr(docs[0], "dbxrefs"))
        self.assertEqual(getattr(docs[0], "dbxrefs")["entrez"], "26191")

        """ 3. Add uniprot and fill in missing entrez fields. """
        call_command(
            "pipeline", "--steps", "download", "load", sections="ENSMART_GENE", dir=TEST_DATA_DIR, ini=MY_INI_FILE
        )
        Search.index_refresh(idx)
        query = ElasticQuery.query_string("DNMT3L", fields=["symbol"])
        elastic = Search(query, idx=idx)
        docs = elastic.search().docs
        self.assertTrue("entrez" in getattr(docs[0], "dbxrefs"))
        self.assertTrue("swissprot" in getattr(docs[0], "dbxrefs"))

        """ 4. Add gene synonyms and dbxrefs. """
        call_command("pipeline", "--steps", "load", sections="GENE_INFO", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
        Search.index_refresh(idx)
        query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
        elastic = Search(query, idx=idx)
        docs = elastic.search().docs
        self.assertTrue("PTPN8" in getattr(docs[0], "synonyms"))

        """ 5. Add PMIDs to gene docs. """
        call_command("pipeline", "--steps", "load", sections="GENE_PUBS", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
        Search.index_refresh(idx)
        query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
        elastic = Search(query, idx=idx)
        docs = elastic.search().docs
        self.assertGreater(len(getattr(docs[0], "pmids")), 0)

        """ 6. Add ortholog data. """
        call_command("pipeline", "--steps", "load", sections="ENSMART_HOMOLOG", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
        Search.index_refresh(idx)
        query = ElasticQuery.query_string("PTPN22", fields=["symbol"])
        elastic = Search(query, idx=idx)
        docs = elastic.search().docs
        dbxrefs = getattr(docs[0], "dbxrefs")
        self.assertTrue("orthologs" in dbxrefs, dbxrefs)
        self.assertTrue("mmusculus" in dbxrefs["orthologs"], dbxrefs)
        self.assertEqual("ENSMUSG00000027843", dbxrefs["orthologs"]["mmusculus"]["ensembl"])

        query = ElasticQuery.filtered(
            Query.match_all(),
            TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl", ["ENSMUSG00000027843"]),
        )
        docs = Search(query, idx=idx, size=1).search().docs
        self.assertEqual(len(docs), 1)

        """ 7. Add mouse ortholog link to MGI """
        call_command("pipeline", "--steps", "load", sections="ENSEMBL2MGI", dir=TEST_DATA_DIR, ini=MY_INI_FILE)
        Search.index_refresh(idx)
        docs = Search(query, idx=idx, size=1).search().docs
        dbxrefs = getattr(docs[0], "dbxrefs")
        self.assertEqual("ENSMUSG00000027843", dbxrefs["orthologs"]["mmusculus"]["ensembl"])
        self.assertEqual("107170", dbxrefs["orthologs"]["mmusculus"]["MGI"])
Example #38
    def test_elastic_group_name(self):
        '''
        Testing the workflow defined in: https://killin.cimr.cam.ac.uk/nextgensite/2015/08/05/region-authorization/
        Testing various elastic queries

        idx doc:
         "_source":{"attr": {"region_id": "803", "group_name": "[\"DIL\"]", "Name": "4q27"},
         "seqid": "chr4", "source": "immunobase", "type": "region",
         "score": ".", "strand": ".", "phase": ".", "start": 122061159, "end": 122684373}
        idx_query:
        Private(in given group) OR Public
        -d '{"query":{"filtered":{"filter":{"bool": {
                                            "should": [
                                                        {"terms": {"group_name":["dil"]}},
                                                        { "missing": { "field": "group_name"   }}
                                                      ]
                                                    }}}}}'
        Private(in given group):
        -d '{"query":{"filtered":{"filter":{"terms":{"group_name":["dil"]}}}}}'
        Public:
        -d '{"query": {"filtered": {"filter": {"missing": {"field": "group_name"}},
                                    "query": {"match_all": {}}}}}'
        '''
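        # A minimal sketch of how the docstring's "Private(in given group) OR Public" filter
        # maps onto the query wrappers exercised below (these exact steps are asserted
        # further down in this test):
        #     group_bool = BoolQuery()
        #     group_bool.should(Query.missing_terms("field", "group_name")) \
        #               .should(Query.terms("group_name", ["dil"]).query_wrap())
        #     query = ElasticQuery.filtered_bool(Query.match_all(), group_bool)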
        # get the groups for the given user
        response = self.client.post('/accounts/login/', {
            'username': '******',
            'password': '******'
        })
        self.assertTrue(response.status_code, "200")

        logged_in_user = User.objects.get(
            id=self.client.session['_auth_user_id'])
        if logged_in_user and logged_in_user.is_authenticated():
            user_groups = get_user_groups(logged_in_user)
            self.assertTrue('READ' in user_groups,
                            "user present in READ group")
            # make sure the user is not yet in DIL group
            self.assertFalse('DIL' in user_groups,
                             "user not present in DIL group")

        group_names = get_user_groups(logged_in_user)
        if 'READ' in group_names: group_names.remove('READ')  # @IgnorePep8
        group_names = [x.lower() for x in group_names]
        self.assertTrue(len(group_names) == 0, "No group present")

        # Match all query, as there is no group we do a match all
        query = ElasticQuery(Query.match_all())
        expected_query_string = {"query": {"match_all": {}}}
        self.assertJSONEqual(json.dumps(query.query),
                             json.dumps(expected_query_string),
                             "Query string matched")

        Search.index_refresh(self.index_name)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 12,
            "Elastic string query retrieved all public regions")

        # Filtered query for group names, add the user to DIL group and get the query string
        self.dil_group = Group.objects.create(name='DIL')
        logged_in_user.groups.add(self.dil_group)
        group_names = get_user_groups(logged_in_user)
        if 'READ' in group_names: group_names.remove('READ')  # @IgnorePep8
        group_names = [x.lower() for x in group_names]
        self.assertTrue(len(group_names) > 0, "At least one group present")
        self.assertTrue("dil" in group_names, "DIL group present")

        # retrieves all docs with missing field group_name - 11 docs
        terms_filter = TermsFilter.get_missing_terms_filter(
            "field", "attr.group_name")
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 11,
            "Elastic string query retrieved all public regions")

        # build filtered boolean query to bring all public docs + private docs 11+1 = 12 docs
        query_bool = BoolQuery()
        query_bool.should(Query.missing_terms("field", "group_name")) \
                  .should(Query.terms("group_name", group_names).query_wrap())

        query = ElasticQuery.filtered_bool(Query.match_all(), query_bool)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 12,
            "Elastic string query retrieved both public + private regions")

        terms_filter = TermsFilter.get_terms_filter("attr.group_name",
                                                    group_names)
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 1,
            "Elastic string query retrieved one private regions")
        self.assertEqual(docs[0].attr['Name'], "4q27", "type matched region")
        self.assertEqual(docs[0].attr['region_id'], "803",
                         "type matched region")
        self.assertEqual(docs[0].attr['group_name'], "[\"DIL\"]",
                         "type matched region")
    def test_exists_filtered_query(self):
        ''' Test building and running a filtered query. '''
        exists_filter = ExistsFilter("start")
        query = ElasticQuery.filtered(Query.term("seqid", 1), exists_filter)
        elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
        self.assertTrue(elastic.search().hits_total >= 1, "Elastic filtered query retrieved marker(s)")
Example #40
File: views.py Project: D-I-L/pydgin
    def get_disease(cls, request, disease, context):
        if disease is None:
            messages.error(request, 'No disease given.')
            raise Http404()
        disease = disease.lower()
        query = ElasticQuery(Query.terms("code", disease.split(',')))
        elastic = Search(query, idx=ElasticSettings.idx('DISEASE', 'DISEASE'), size=5)
        res = elastic.search()
        if res.hits_total == 0:
            messages.error(request, 'Disease(s) '+disease+' not found.')
        elif res.hits_total < 9:
            disease_docs = res.docs
            names = ', '.join([getattr(doc, 'name') for doc in disease_docs])

            meta_response = Search.elastic_request(ElasticSettings.url(), ElasticSettings.idx("IC_STATS") + '/_mapping',
                                                   is_post=False)
            elastic_meta = json.loads(meta_response.content.decode("utf-8"))
            disease_docs = res.docs
            for dis in disease_docs:
                dis_code = getattr(dis, 'code').upper()
                docs = DiseaseLocusDocument.get_disease_loci_docs(dis_code)
                regions = []
                ens_all_cand_genes = []
                all_markers = []
                for r in docs:
                    region = r.get_disease_region()
                    if region is not None:
                        regions.append(region)
                        ens_all_cand_genes.extend(region['ens_cand_genes'])
                        all_markers.extend(region['markers'])

                # get ensembl to gene symbol mapping for all candidate genes
                all_cand_genes = gene.utils.get_gene_docs_by_ensembl_id(ens_all_cand_genes)
                for region in regions:
                    region['cand_genes'] = {cg: all_cand_genes[cg] for cg in region.pop("ens_cand_genes", None)}
                setattr(dis, 'regions', regions)

                # look for pleiotropy by looking for diseases for the markers in IC_STATS and other study hits
                stats_query = ElasticQuery.filtered(Query.terms("marker", all_markers),
                                                    Filter(RangeQuery("p_value", lte=5E-08)), sources=['marker'])
                stats_docs = Search(stats_query, idx=ElasticSettings.idx("IC_STATS"),
                                    size=len(all_markers)).search().docs

                other_hits_query = ElasticQuery(
                        BoolQuery(must_arr=[RangeQuery("tier", lte=2), Query.terms("marker", all_markers)]),
                        sources=['marker', 'disease'])
                other_hits = Search(other_hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'),
                                    size=5000).search().docs

                for region in regions:
                    diseases = [dis_code]
                    for doc in stats_docs:
                        if getattr(doc, 'marker') in region['markers']:
                            meta_info = elastic_meta[doc.index()]['mappings'][doc.type()]['_meta']
                            if meta_info['disease'] not in diseases:
                                diseases.append(meta_info['disease'])

                    for doc in other_hits:
                        if getattr(doc, 'marker') in region['markers']:
                            if doc.disease is not None and doc.disease not in diseases:
                                diseases.append(doc.disease)
                    region['diseases'] = diseases

                studies = StudyDocument.get_studies(disease_code=dis_code)
                for doc in studies:
                    setattr(doc, 'study_id', getattr(doc, 'study_id').replace('GDXHsS00', ''))
                    pmid = getattr(doc, 'principal_paper')
                    pubs = PublicationDocument.get_publications(pmid, sources=['date', 'authors.name', 'journal'])
                    if len(pubs) > 0:
                        authors = getattr(pubs[0], 'authors')
                        setattr(doc, 'date', getattr(pubs[0], 'date'))
                        setattr(doc, 'journal', getattr(pubs[0], 'journal'))
                        setattr(doc, 'author', authors[0]['name'].rsplit(None, 1)[-1] if authors else "")
                setattr(dis, 'studies', studies)

            context['features'] = disease_docs
            context['title'] = names
            return context
        raise Http404()
        else:
            features[qid] = src['score']
    update_count += len(hits)
    print(str(update_count)+" of "+str(resp_json['hits']['total']))

scan_n_sroll = ScanAndScroll.scan_and_scroll(criteria_idx, call_fun=process_hits, idx_type=criteria_idx_type)

# Update suggest weights
update_count = 0
chunk_size = 2000
feature_names = list(features.keys())

for i in range(0, len(feature_names), chunk_size):
    feature_names_slice = feature_names[i:i+chunk_size]
    terms_filter = TermsFilter.get_terms_filter(feature_id, feature_names_slice)
    query = ElasticQuery.filtered(Query.match_all(), terms_filter, sources=[feature_id, 'suggest'])
    docs = Search(query, idx=idx, idx_type=idx_type, size=chunk_size).search().docs

    json_data = ''
    for doc in docs:
        doc_id = doc.doc_id()
        if feature_id.startswith('dbxrefs'):
            obj_id = getattr(doc, 'dbxrefs')['ensembl']
        else:
            obj_id = getattr(doc, feature_id)
        doc_data = {"update": {"_id": doc_id, "_type": idx_type,
                               "_index": idx, "_retry_on_conflict": 3}}
        json_data += json.dumps(doc_data) + '\n'

        suggest = getattr(doc, 'suggest')
        if suggest is None:
Example #42
def chicpeaSearch(request, url):
    queryDict = request.GET
    user = request.user
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    hic = []
    addList = []
    searchType = 'gene'
    searchTerm = queryDict.get("searchTerm").upper()
    searchTerm = searchTerm.replace(",", "")
    searchTerm = searchTerm.replace("..", "-")
    snpTrack = queryDict.get("snp_track")

    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(
                                            user=user, idx_keys=None, idx_type_keys=None)

    if snpTrack:
        mo = re.match(r"(.*)-(.*)", snpTrack)
        (group, track) = mo.group(1, 2)  # @UnusedVariable
        if group != 'ud' and 'CP_STATS_'+group.upper()+'.'+snpTrack.upper() not in idx_type_keys_auth:
            snpTrack = None

    if targetIdx not in utils.tissues:
        for target in getattr(chicp_settings, 'CP_TARGET'):
            if 'CP_TARGET_'+target not in idx_keys_auth:
                if targetIdx == target:
                    retJSON = {'error': 'Sorry, you do not have permission to view this dataset.'}
                    return JsonResponse(retJSON)
                continue
            elasticJSON = Search(idx=ElasticSettings.idx('CP_TARGET_'+target)).get_mapping(mapping_type="gene_target")
            tissueList = list(elasticJSON[ElasticSettings.idx('CP_TARGET_'+target)]
                              ['mappings']['gene_target']['_meta']['tissue_type'].keys())
            utils.tissues['CP_TARGET_'+target] = tissueList

    if queryDict.get("region") or re.match(r"(.*):(\d+)-(\d+)", searchTerm):
        searchType = 'region'
        region = searchTerm
        if queryDict.get("region"):
            region = queryDict.get("region")
        else:
            searchTerm = ""
        mo = re.match(r"(.*):(\d+)-(\d+)", region)
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "")
        chrom = chrom.replace('CHR', "")
    if re.search("^rs[0-9]+", searchTerm.lower()):
        searchTerm = searchTerm.lower()
        addList.append(_find_snp_position(snpTrack, searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        if searchType != 'region':
            searchType = 'snp'

    logger.warn("### "+searchType+" - "+searchTerm+' ###')

    if searchType == 'region':
        query_bool = BoolQuery()
        filter_bool = BoolQuery()
        if searchTerm and len(addList) == 0 and re.match(r"(.*):(\d+)-(\d+)",
                                                         queryDict.get("searchTerm").replace(",", "")) == None:
            query_bool.must([Query.query_string(searchTerm, fields=["name", "ensg"]),
                             Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        else:
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])

        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if len(addList) > 0:
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])
        else:
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
        (hic, v1, v2) = _build_hic_query(query, targetIdx, segmin, segmax)  # @UnusedVariable

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': queryDict.get("searchTerm")+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        if len(addList) > 0:
            chrom = addList[0]['chr']

            query_bool = BoolQuery()
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
            query_bool = _add_tissue_filter(query_bool, targetIdx)

            filter_bool = BoolQuery()
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])

            query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                               sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
            hic, segmin, segmax = _build_hic_query(query, targetIdx)

            if "error" in hic:
                return JsonResponse(hic)
            if len(hic) == 0:
                retJSON = {'error': 'Marker '+searchTerm+' does not overlap any bait/target regions in this dataset.'}
                return JsonResponse(retJSON)
    else:
        # geneQuery = ElasticQuery.query_string(searchTerm, fields=["gene_name"])
        geneQuery = ElasticQuery.filtered(Query.match_all(), Filter(Query.match("gene_name", searchTerm).query_wrap()))
        resultObj = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/',
                           search_query=geneQuery, size=0, qsort=Sort('seqid:asc,start')).search()
        if resultObj.hits_total > 1:
            geneResults = []
            resultObj2 = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/', search_query=geneQuery,
                                size=(resultObj.hits_total+1), qsort=Sort('seqid:asc,start')).search()

            docs = resultObj2.docs
            gene_ids = [getattr(doc, 'attr')['gene_id'][1:-1] for doc in docs]

            query = ElasticQuery.filtered(Query.match_all(), TermsFilter.get_terms_filter('ensg', gene_ids))
            agg = Agg('ensg_agg', "terms", {"field": "ensg", "size": 0})
            res = Search(idx=ElasticSettings.idx('CP_TARGET_'+targetIdx), search_query=query, aggs=Aggs(agg),
                         size=0).search()

            ensg_count = res.aggs['ensg_agg'].get_buckets()
            gene_ids = [g['key'] for g in ensg_count]

            for d in resultObj2.docs:
                if getattr(d, "attr")["gene_id"].replace('\"', '') in gene_ids:
                    geneResults.append({
                        'gene_name': getattr(d, "attr")["gene_name"].replace('\"', ''),
                        'gene_id': getattr(d, "attr")["gene_id"].replace('\"', ''),
                        'location': "chr" + getattr(d, "seqid") + ":" +
                        locale.format_string("%d", getattr(d, "start"), grouping=True) + ".." +
                        locale.format_string("%d", getattr(d, "end"), grouping=True),
                    })

            if len(geneResults) == 0:
                retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
                return JsonResponse(retJSON)
            elif len(geneResults) > 1:
                retJSON = {
                    'error': 'Gene name <strong>'+searchTerm+'</strong> returns too many hits, please select your preferred result from the list below.',
                    'results': geneResults,
                    'cols': ['HGNC Symbol', 'Ensembl Gene ID', 'Location']
                }
                return JsonResponse(retJSON)

        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool, sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])

        (hic, segmin, segmax) = _build_hic_query(query, targetIdx)

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
            return JsonResponse(retJSON)
        chrom = hic[0]['baitChr']

    try:
        chrom
    except NameError:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(snpTrack, chrom, segmin, segmax)
    frags = _build_frags_query(getattr(chicp_settings, 'DEFAULT_FRAG'), chrom, segmin, segmax)

    addList = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], addList)

    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": int(segmin),
                        "oend": int(segmax),
                        "rstart": 1,
                        "rend": int(segmax) - int(segmin),
                        "rchr": str(chrom),
                        "tissues": utils.tissues['CP_TARGET_'+targetIdx]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }

    response = JsonResponse(retJSON)
    return response
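
A minimal usage sketch for the view above: chicpeaSearch is driven entirely by GET parameters
(searchTerm, targetIdx, snp_track and, optionally, region). The endpoint path and the example
parameter values below are assumptions for illustration; only the parameter names come from
the view itself.

# Hypothetical usage via Django's test client; the URL and all values are placeholders.
import json
from django.test import Client

client = Client()
response = client.get('/chicp/search/', {
    'searchTerm': 'rs2476601',   # an rs identifier switches searchType to 'snp'
    'targetIdx': 'IFNg',         # assumed name of a configured CP_TARGET_* dataset
    'snp_track': 'gwas-t1d',     # optional "<group>-<track>" SNP track
})
data = json.loads(response.content.decode('utf-8'))
print(data.get('region', data.get('error')))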