def test_sort_query(self):
     ''' Check result ordering for a match_all query using both a string
     sort specification and a dictionary sort specification, and that an
     unsupported sort argument raises a QueryError. '''
     match_all = ElasticQuery(Query.match_all())
     sort_specs = ('start:asc,_score',
                   {"sort": [{"start": {"order": "asc", "mode": "avg"}}]})
     for sort_spec in sort_specs:
         search = Search(match_all, idx=ElasticSettings.idx('DEFAULT'), qsort=Sort(sort_spec))
         self._check_sort_order(search.search().docs)
     # anything that is neither a string nor a dictionary is rejected
     with self.assertRaises(QueryError):
         Sort(1)
    def test_term_query(self):
        ''' Test building and running term queries: an exact id lookup and
        a boosted term lookup on the seqid field. '''
        by_id = Search(ElasticQuery(Query.term("id", "rs2476601")),
                       idx=ElasticSettings.idx('DEFAULT'))
        n_id_docs = len(by_id.search().docs)
        self.assertTrue(n_id_docs == 1, "Elastic string query retrieved marker (rs2476601)")

        by_seqid = Search(ElasticQuery(Query.term("seqid", "1", boost=3.0)),
                          idx=ElasticSettings.idx('DEFAULT'))
        n_seqid_docs = len(by_seqid.search().docs)
        self.assertTrue(n_seqid_docs > 1, "Elastic string query retrieved markers  on chr1")
 def test_query_ids(self):
     ''' Test ids queries: first with a list of ids, then with a single id
     restricted to the document type of the first result. '''
     def run_ids_query(ids_query):
         # every search in this test uses the DEFAULT index and a size of 5
         search = Search(ElasticQuery(ids_query), idx=ElasticSettings.idx('DEFAULT'), size=5)
         return search.search().docs

     both_docs = run_ids_query(Query.ids(['1', '2']))
     self.assertTrue(len(both_docs) == 2, "Elastic string query retrieved marker (rs*)")

     doc_type = both_docs[0].type()
     single_doc = run_ids_query(Query.ids('2', types=doc_type))
     self.assertTrue(len(single_doc) == 1, "Elastic string query retrieved marker (rs*)")
    def test_term(self):
        ''' Terms aggregation on seqid, run on its own and then combined
        with an ids query. '''
        name = "test"
        terms_aggs = Aggs(Agg(name, "terms", {"field": "seqid", "size": 0}))

        # aggregation only, no query
        agg_only = Search(aggs=terms_aggs, idx=ElasticSettings.idx('DEFAULT'))
        self.assertTrue(name in agg_only.search().aggs, "returned test aggregations")

        # the same aggregation restricted by an ids query
        ids_query = ElasticQuery(Query.ids(['1', '2']))
        with_query = Search(search_query=ids_query, aggs=terms_aggs,
                            idx=ElasticSettings.idx('DEFAULT'), size=5)
        result = with_query.search().aggs
        self.assertTrue(len(result[name].get_buckets()) > 0, "returned test aggregation buckets")
        self.assertTrue(result[name].buckets[0]['doc_count'] >= 0, "bucket document count")
Пример #5
0
 def get_gene_docs_by_ensembl_id(cls, ens_ids, sources=None):
     ''' Fetch the gene documents for the given array of ensembl IDs.
     An ids query is run against the GENE index/type, asking for as many
     hits as there are IDs. A dictionary is returned with the key being the
     id of each returned document and the value the gene document. '''
     ids_query = ElasticQuery(Query.ids(ens_ids), sources=sources)
     gene_idx = ElasticSettings.idx('GENE', idx_type='GENE')
     search = Search(ids_query, idx=gene_idx, size=len(ens_ids))
     docs_by_id = {}
     for gene_doc in search.search().docs:
         docs_by_id[gene_doc.doc_id()] = gene_doc
     return docs_by_id
    def test_pubs_disease_tags(self):
        ''' For each disease compare the number of indexed publications tagged
        with that disease (tags.disease) against the number of PMIDs returned
        by _get_pmids, printing a per-disease report of any differences. '''
        counts_match = True
        report_lines = []
        for disease in DiseasePublicationTest.DISEASES:
            pmids = self._get_pmids(disease)
            code = disease.lower()
            tag_filter = Filter(Query.term('tags.disease', code))
            tag_query = ElasticQuery(BoolQuery(b_filter=tag_filter), sources=['pmid'])
            search = Search(search_query=tag_query,
                            idx=ElasticSettings.idx('PUBLICATION'), size=len(pmids)*2)
            n_indexed = search.get_count()['count']

            line = code+'\tINDEX: '+str(n_indexed)+'\tNCBI: '+str(len(pmids))
            if n_indexed != len(pmids):
                counts_match = False
                # only fetch the documents when the counts disagree
                indexed_pmids = [doc.pmid for doc in search.search().docs]
                extra = [pmid for pmid in indexed_pmids if pmid not in pmids]
                missing = [pmid for pmid in pmids if pmid not in indexed_pmids]
                if extra:
                    line += '\textra PMIDs: '+str(extra)
                if missing:
                    line += '\tmissing PMIDs: '+str(missing)
            report_lines.append(line+'\n')

        print(''.join(report_lines))
        self.assertTrue(counts_match, 'Count for disease tags')
 def test_significant_terms(self):
     ''' Significant terms aggregation on the start field is returned
     under the name it was requested with. '''
     agg_name = 'test_significant_terms'
     sig_aggs = Aggs(Agg(agg_name, "significant_terms", {"field": "start"}))
     result = Search(aggs=sig_aggs, idx=ElasticSettings.idx('DEFAULT')).search().aggs
     self.assertTrue(agg_name in result, "returned aggregations")
Пример #8
0
def show_es_gene_section(gene_symbol=None,
                         seqid=None,
                         start_pos=None,
                         end_pos=None):
    ''' Template inclusion tag to render a gene section.

    The GENE index is searched either by gene symbol (when one is given)
    or by position on a chromosome. Any 'chr' text in seqid is removed
    before it is matched against the chromosome field. The returned
    context dictionary holds the matching documents under 'es_genes'. '''
    chromosome = str(seqid).replace('chr', '')

    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("symbol", gene_symbol)
    else:
        must_clauses = [Query.match("chromosome", chromosome)]
        if end_pos is None:
            # single position (e.g. a SNP): genes that span start_pos
            must_clauses.append(RangeQuery("start", lte=start_pos))
            must_clauses.append(RangeQuery("stop", gte=start_pos))
        else:
            # interval: genes lying entirely between start_pos and end_pos
            must_clauses.append(RangeQuery("start", gte=start_pos))
            must_clauses.append(RangeQuery("stop", lte=end_pos))
        query = ElasticQuery.bool(BoolQuery(must_arr=must_clauses))

    search = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': search.search().docs}
 def test_string_query(self):
     ''' Test building and running a query string query on the id field,
     and that an unknown keyword argument raises a QueryError. '''
     string_query = ElasticQuery.query_string("rs2476601", fields=["id"])
     result = Search(string_query, idx=ElasticSettings.idx('DEFAULT')).search()
     self.assertTrue(len(result.docs) == 1, "Elastic string query retrieved marker (rs2476601)")
     # misspelt keyword argument
     with self.assertRaises(QueryError):
         ElasticQuery.query_string("rs2476601", fieldssss=["id"])
    def test_filter(self):
        ''' Filter aggregation submitted together with several metric
        aggregations (avg, min, sum, stats, value_count, extended_stats)
        on the start field. '''
        metrics = (('avg_start', 'avg'),
                   ('min_start', 'min'),
                   ('sum_start', 'sum'),
                   ('stats_start', 'stats'),
                   ('count_start', 'value_count'),
                   ('ext_stats_start', 'extended_stats'))
        agg_list = [Agg('test_filter', 'filter', RangeQuery('start', gt='25000'))]
        agg_list.extend(Agg(name, agg_type, {"field": 'start'}) for name, agg_type in metrics)

        search = Search(aggs=Aggs(agg_list), idx=ElasticSettings.idx('DEFAULT'))
        result = search.search().aggs

        self.assertTrue('avg_start' in result, "returned avg aggregation")
        self.assertTrue('min_start' in result, "returned min aggregation")

        # extended_stats is expected to expose everything stats does plus extras
        stats_keys = ["min", "max", "sum", "count", "avg"]
        ext_stats_keys = stats_keys + ["sum_of_squares", "variance",
                                       "std_deviation", "std_deviation_bounds"]
        for agg_name, expected_keys in (('stats_start', stats_keys),
                                        ('ext_stats_start', ext_stats_keys)):
            has_all_keys = all(hasattr(result[agg_name], key) for key in expected_keys)
            self.assertTrue(has_all_keys, "returned min aggregation")
Пример #11
0
def _get_pub_docs_by_pmid(pmids, sources=None):
    """ Fetch the publication documents for the given array of PMIDs.
    An ids query is run against the PUBLICATION index, asking for as many
    hits as there are PMIDs. A dictionary is returned with the key being
    the id of each returned document and the value the document. """
    ids_query = ElasticQuery(Query.ids(pmids), sources=sources)
    pub_idx = ElasticSettings.idx("PUBLICATION")
    search = Search(ids_query, idx=pub_idx, size=len(pmids))
    docs_by_id = {}
    for pub_doc in search.search().docs:
        docs_by_id[pub_doc.doc_id()] = pub_doc
    return docs_by_id
 def test_missing_terms_filtered_query(self):
     ''' Test a match_all query filtered with a missing terms filter. '''
     missing_filter = TermsFilter.get_missing_terms_filter("field", "group_name")
     filtered = ElasticQuery.filtered(Query.match_all(), missing_filter)
     search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
     n_docs = len(search.search().docs)
     self.assertTrue(n_docs == 3, "Elastic string query retrieved all public docs")
Пример #13
0
    def filter_queryset(self, request, queryset, view):
        ''' Override this method to request just the documents required from Rserve.

        The marker (default 'rs2476601') and dataset (default 'EUR', with any
        '-' removed) are read from the request parameters listed in the view's
        filter_fields. The marker's seqid is looked up in the MARKER index and
        passed with the dataset and marker to the R function get_pop, whose
        result is parsed as JSON.

        @return: a single-element list holding an ElasticObject with the
                 'populations' and 'marker'. 'populations' is None when a
                 TypeError, ValueError, IndexError or ConnectionError occurs.
        '''
        # bound before the try so the error handler can always reference it
        mid1 = None
        try:
            filterable = getattr(view, 'filter_fields', [])
            filters = {k: v for k, v in request.GET.items() if k in filterable}

            mid1 = filters.get('marker', 'rs2476601')
            dataset = filters.get('dataset', 'EUR').replace('-', '')
            query = ElasticQuery(BoolQuery(must_arr=[Query.term("id", mid1)]), sources=['seqid', 'start'])
            elastic = Search(search_query=query, idx=ElasticSettings.idx('MARKER', 'MARKER'), size=1)
            # IndexError here (marker not found) is handled below
            doc = elastic.search().docs[0]
            seqid = getattr(doc, 'seqid')

            rserve = getattr(settings, 'RSERVE')
            conn = pyRserve.connect(host=rserve.get('HOST'), port=rserve.get('PORT'))
            try:
                pop_str = conn.r.get_pop(dataset, seqid, mid1)
                pops = json.loads(str(pop_str))
            finally:
                # always release the Rserve connection, even if the call or
                # the JSON parsing fails
                conn.close()

            # record each population's name inside its own entry
            populations = []
            for pop, pop_data in pops.items():
                pop_data['population'] = pop
                populations.append(pop_data)
            return [ElasticObject(initial={'populations': populations, 'marker': mid1})]
        except (TypeError, ValueError, IndexError, ConnectionError):
            return [ElasticObject(initial={'populations': None, 'marker': mid1})]
Пример #14
0
    def get_disease_tags(cls, feature_id, idx=None, idx_type=None):
        ''' function to get the aggregated list of disease_tags for a given feature id, aggregated
            from all criteria_types for a feature type
        @type  feature_id: string
        @param feature_id: Id of the feature (gene => gene_id, region=>region_id)
        @type  idx: string
        @param idx: name of the index
        @type  idx_type: string
        @param idx_type: name of the idx type, each criteria is an index type
        @return: list of the core diseases followed by the other diseases as returned by
                 Disease.get_site_diseases; an empty list if the search or aggregation
                 lookup fails; None if no disease tags were found
        '''
        query = ElasticQuery(Query.term("qid", feature_id))
        agg = Agg("criteria_disease_tags", "terms", {"field": "disease_tags", "size": 0})
        aggs = Aggs(agg)

        # only pass idx_type when one was given so that Search keeps its own default otherwise
        if idx_type:
            search = Search(query, aggs=aggs, idx=idx, idx_type=idx_type)
        else:
            search = Search(query, aggs=aggs, idx=idx)

        try:
            r_aggs = search.search().aggs
            buckets = r_aggs['criteria_disease_tags'].get_buckets()
            disease_tags = [dis_dict['key'].lower() for dis_dict in buckets]
        except Exception:
            # best effort: any failure is reported as "no tags", but unlike a bare
            # except this lets KeyboardInterrupt and SystemExit propagate
            return []

        # get disease docs
        if disease_tags:
            (core, other) = Disease.get_site_diseases(dis_list=disease_tags)
            diseases = list(core)
            diseases.extend(other)
            return diseases
        # callers may rely on None (rather than []) for "no disease tags"
        return None
Пример #15
0
def show_es_gene_section(gene_symbol=None,
                         seqid=None,
                         start_pos=None,
                         end_pos=None):
    ''' Template inclusion tag to render a gene section.

    The GENE index is searched either by gene symbol (when one is given)
    or by position on a sequence. A seqid string already starting with
    'chr' is used as it is; any other value (including None) is converted
    to a string and prefixed with 'chr'. The returned context dictionary
    holds the matching documents under 'es_genes'. '''
    if not (isinstance(seqid, str) and seqid.startswith("chr")):
        seqid = 'chr' + str(seqid)

    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("gene_symbol", gene_symbol)
    else:
        must_clauses = [Query.match("seqid", seqid)]
        if end_pos is None:
            # single position (e.g. a SNP): features that span start_pos
            must_clauses.append(RangeQuery("featureloc.start", lte=start_pos))
            must_clauses.append(RangeQuery("featureloc.end", gte=start_pos))
        else:
            # interval: features lying entirely between start_pos and end_pos
            must_clauses.append(RangeQuery("featureloc.start", gte=start_pos))
            must_clauses.append(RangeQuery("featureloc.end", lte=end_pos))
        query = ElasticQuery.bool(BoolQuery(must_arr=must_clauses))

    search = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': search.search().docs}
 def test_top_hits(self):
     ''' Top hits aggregation limited to one hit, submitted alongside a
     filter aggregation. '''
     range_agg = Agg('test_filter', 'filter', RangeQuery('start', gt='2000'))
     top_hits_agg = Agg('test_top_hits', 'top_hits', {"size": 1})
     search = Search(aggs=Aggs([range_agg, top_hits_agg]), idx=ElasticSettings.idx('DEFAULT'))
     result = search.search().aggs
     top_hits = result['test_top_hits'].get_hits()
     self.assertTrue(len(top_hits) == 1, "returned the top hit")
 def test_top_hits_sub_agg(self):
     ''' Terms aggregation on _index with a top hits sub-aggregation:
     the bucket for the DEFAULT index counts three documents and holds
     a single top-hit document. '''
     idx_agg = Agg("idxs", "terms", {"field": "_index"},
                   sub_agg=Agg('idx_top_hits', 'top_hits', {"size": 1}))
     type_agg = Agg("categories", "terms", {"field": "_type", "size": 0})
     search = Search(aggs=Aggs([idx_agg, type_agg]), idx=ElasticSettings.idx('DEFAULT'))
     docs_by_idx = search.search().aggs['idxs'].get_docs_in_buckets()
     default_bucket = docs_by_idx[ElasticSettings.idx('DEFAULT')]
     self.assertEqual(default_bucket['doc_count'], 3)
     self.assertEqual(len(default_bucket['docs']), 1)
 def test_missing(self):
     ''' Missing aggregation: no document lacks the seqid field. '''
     missing_aggs = Aggs(Agg("test_missing", "missing", {"field": "seqid"}))
     result = Search(aggs=missing_aggs, idx=ElasticSettings.idx('DEFAULT')).search().aggs
     n_missing = result['test_missing'].doc_count
     self.assertTrue(n_missing == 0,
                     "no missing seqid fields")
Пример #19
0
        def check_hits(resp_json):
            ''' Compare the position of each hit in resp_json with the start position of
            the marker document that has the same id, and log an error for those that
            differ by more than one.

            resp_json is indexed as resp_json['hits']['hits']; each hit is wrapped in a
            Document. NOTE(review): self is not a parameter here, so it is presumably
            taken from the enclosing method's scope - confirm. '''
            # index the incoming documents by their "id" attribute
            rsids = {}
            docs = [Document(hit) for hit in resp_json['hits']['hits']]
            for doc in docs:
                rsid = getattr(doc, "id")
                if rsid is not None:
                    rsids[rsid] = doc
            # fetch the marker documents for all of those ids in one search
            rsids_keys = list(rsids.keys())
            terms_filter = TermsFilter.get_terms_filter("id", rsids_keys)
            query = ElasticQuery.filtered(Query.match_all(), terms_filter)
            elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'), size=len(rsids_keys))
            docs_by_rsid = elastic.search().docs
            for doc in docs_by_rsid:
                info = getattr(doc, "info")
                # only markers whose info contains 'VC=SNV' are checked
                if 'VC=SNV' not in info:
                    continue
                rsid = getattr(doc, "id")
                ic_doc = rsids[rsid]
                # pos1: marker start; pos2: position reported by _get_highest_build for the hit
                pos1 = getattr(doc, "start")
                pos2 = self._get_highest_build(ic_doc)['position']
                if abs(int(pos1) - int(pos2)) > 1:
                    is_par = getattr(ic_doc, 'is_par')
                    allele_a = getattr(ic_doc, 'allele_a')
                    # skip hits with is_par set or with an allele_a of 'D' or 'I'
                    if is_par is None and not (allele_a == 'D' or allele_a == 'I'):
                        msg = ("CHECK IC/DBSNP POSITIONS:: "+getattr(ic_doc, 'name') +
                               ' '+str(pos2)+" "+rsid+' '+str(pos1))
#                                ' ('+ic_doc.doc_id()+' '+json.dumps(getattr(ic_doc, 'build_info'))+')'

                        # append any markers found on the same seqid whose start equals pos2
                        query = ElasticQuery.filtered(Query.term("seqid", getattr(doc, 'seqid')),
                                                      Filter(Query.term("start", pos2)))
                        elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
                        docs_by_pos = elastic.search().docs
                        if len(docs_by_pos) > 0:
                            for d in docs_by_pos:
                                msg += " ("+getattr(d, "id")+":"+str(getattr(d, "start"))+")"

                        # append any HISTORY documents whose rslow equals this rsid
                        query = ElasticQuery.filtered(Query.match_all(), Filter(Query.term("rslow", rsid)))
                        elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'HISTORY'))
                        docs_by_pos = elastic.search().docs
                        if len(docs_by_pos) > 0:
                            for d in docs_by_pos:
                                msg += " (rshigh:"+str(getattr(d, "rshigh")) + \
                                       " build_id:"+str(getattr(d, "build_id"))+")"

                        logger.error(msg)
 def test_and_filtered_query(self):
     ''' Test building and running a term query filtered by an and filter
     made of a bool query, a range query and a term query. '''
     and_filter = AndFilter(BoolQuery(must_arr=[RangeQuery("start", gte=1)]))
     (and_filter.extend(RangeQuery("start", gte=1))
                .extend(Query.term("seqid", 1)))
     filtered = ElasticQuery.filtered(Query.term("seqid", 1), and_filter)
     search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
     hits_total = search.search().hits_total
     self.assertTrue(hits_total >= 1, "Elastic filtered query retrieved marker(s)")
 def test_bool_filtered_query(self):
     ''' Test a match_all query filtered by a bool query that is built from
     constructor arguments plus chained must/should clauses. '''
     bool_filter = BoolQuery(must_not_arr=[Query.term("seqid", 2)],
                             should_arr=[RangeQuery("start", gte=10050)])
     (bool_filter.must([Query.term("id", "rs768019142")])
                 .should(RangeQuery("start", gte=10054)))
     filtered = ElasticQuery.filtered_bool(Query.match_all(), bool_filter, sources=["id", "seqid"])
     search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
     hits_total = search.search().hits_total
     self.assertTrue(hits_total == 1, "Elastic filtered query retrieved marker (rs768019142)")
Пример #22
0
    def fetch_overlapping_features(cls, build, seqid, start, end, idx=None, idx_type=None, disease_id=None):
        ''' function to fetch overlapping features for a given stretch of region
            the build info is stored as nested document..so nested query is build.
            A feature overlaps if its start lies in the region, or its end lies in the
            region, or it spans the whole region. When a disease_id is given the result
            is additionally restricted to that disease.
        @type  build: string
        @param build: build info eg: 'GRCh38'
        @type  seqid: string
        @param seqid: chromosome number
        @type  start:  string
        @param start: region start
        @type  end:  string
        @param end: region end
        @type  idx: string
        @param idx: name of the index
        @type  idx_type: string
        @param idx_type: name of the idx type, each criteria is an index type
        @type  disease_id:  string
        @param disease_id: disease code
        @return: the documents returned by the search
        '''
        # feature starts before and ends after the region (spans it completely)
        bool_range = BoolQuery()
        bool_range.must(RangeQuery("build_info.start", lte=start)) \
                  .must(RangeQuery("build_info.end", gte=end))

        # feature start in region OR feature end in region OR feature spans region
        or_filter = OrFilter(RangeQuery("build_info.start", gte=start, lte=end))
        or_filter.extend(RangeQuery("build_info.end", gte=start, lte=end)) \
                 .extend(bool_range)

        # The build/seqid/overlap constraints are built before branching on disease_id.
        # Previously they were only added when no disease_id was given, so the disease
        # branch nested an empty bool query and ignored the requested region.
        build_info_query = BoolQuery()
        build_info_query.must(Query.term("build_info.build", build)) \
                        .must(Query.term("build_info.seqid", seqid)) \
                        .filter(or_filter)
        qnested_buildinfo = Query.nested('build_info', build_info_query)

        if disease_id:
            top_query = BoolQuery()
            top_query.must(Query.term("disease", disease_id.lower())).must(qnested_buildinfo)
        else:
            top_query = qnested_buildinfo

        qnested = ElasticQuery(top_query, sources=['build_info.*',
                                                   'disease_locus',
                                                   'disease',
                                                   'chr_band',
                                                   'species'])

        elastic = Search(qnested, idx=idx, idx_type=idx_type)
        res = elastic.search()
        return res.docs
Пример #23
0
 def test_url_rotate(self):
     ''' Test the url rotates from http://xxx:9200 to correct url. '''
     filtered = ElasticQuery.filtered(Query.term("seqid", 1),
                                      Filter(Query.term("id", "rs768019142")))
     result = Search(filtered, idx=ElasticSettings.idx('DEFAULT')).search()
     self.assertTrue(result.hits_total == 1,
                     "Elastic filtered query retrieved marker")
     Search.index_exists('test', 'test2')
     # put the url index back to the first url for the tests that follow
     ElasticUrl.URL_INDEX = 0
 def test_terms_avg_order(self):
     ''' Test a terms aggregation ordered (descending) by an average
     sub-aggregation. '''
     name = "test"
     avg_start = Agg('avg_start', 'avg', {"field": "start"})
     terms_body = {"field": "seqid", "size": 0, "order": {"avg_start": "desc"}}
     ordered_aggs = Aggs(Agg(name, "terms", terms_body, sub_agg=avg_start))
     result = Search(aggs=ordered_aggs, idx=ElasticSettings.idx('DEFAULT')).search().aggs
     self.assertTrue(name in result, "returned test aggregations")
     first_bucket = result['test'].get_buckets()[0]
     self.assertGreater(first_bucket['doc_count'], 1)
 def test_bool_filtered_query2(self):
     ''' Test a query string query filtered by a bool query built purely
     from chained should/must clauses. '''
     bool_filter = BoolQuery()
     (bool_filter.should(RangeQuery("start", lte=20000))
                 .should(Query.term("seqid", 2))
                 .must(Query.term("seqid", 1)))
     string_query = Query.query_string("rs768019142", fields=["id", "seqid"])
     filtered = ElasticQuery.filtered_bool(string_query, bool_filter, sources=["id", "seqid", "start"])
     search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
     hits_total = search.search().hits_total
     self.assertTrue(hits_total == 1, "Elastic filtered query retrieved marker (rs768019142)")
Пример #26
0
    def is_region_for_disease(cls, hit, section=None, config=None, result_container=None):
        ''' Populate the result container with the diseases of a region hit.

        For every disease locus id of the region the DISEASE_LOCUS document is
        fetched, and for each of its hits the STUDY_HITS document is fetched to
        collect its disease. The container is returned unchanged as soon as a
        study hit has a status other than 'N' or a disease_locus of 'tbc'
        (case-insensitive).

        @type  hit: dict
        @param hit: region hit with '_id' and '_source' ('disease_loci', 'region_id')
        @param section: not used by this method
        @param config: not used by this method
        @type  result_container: dict
        @param result_container: container to add to; a new dict is used when omitted
        @return: the container as returned by populate_container for each disease,
                 or result_container itself on an early return or if no diseases were found
        '''
        # a new dict per call; a mutable default would be shared between calls
        if result_container is None:
            result_container = {}

        result_container_populated = result_container
        feature_doc = hit['_source']
        feature_doc['_id'] = hit['_id']
        region_id = feature_doc['region_id']

        diseases = set()
        for disease_locus_id in feature_doc['disease_loci']:

            query = ElasticQuery(Query.ids([disease_locus_id]), sources=['hits'])
            elastic = Search(query, idx=ElasticSettings.idx('REGION', idx_type='DISEASE_LOCUS'))

            for disease_locus_doc in elastic.search().docs:
                # distinct names so that the hit parameter is not shadowed
                for study_hit_id in getattr(disease_locus_doc, 'hits'):
                    query = ElasticQuery(Query.ids([study_hit_id]))
                    elastic = Search(query, idx=ElasticSettings.idx('REGION', idx_type='STUDY_HITS'))
                    # NOTE(review): raises IndexError if the study hit is not in the index
                    hit_doc = elastic.search().docs[0]

                    disease = getattr(hit_doc, "disease")
                    status = getattr(hit_doc, "status")

                    if status != 'N':
                        return result_container

                    if getattr(hit_doc, "disease_locus").lower() == 'tbc':
                        return result_container

                    diseases.add(disease)

        for disease in diseases:
            result_container_populated = cls.populate_container(disease,
                                                                disease,
                                                                fnotes=None, features=[region_id],
                                                                diseases=[disease],
                                                                result_container=result_container_populated)
        return result_container_populated
 def test_filters(self):
     ''' Filters aggregation with two named range filters on start. '''
     named_filters = {'filters': {'start_gt': RangeQuery('start', gt='1000'),
                                  'start_lt': RangeQuery('start', lt='100000')}}
     filters_aggs = Aggs(Agg('test_filters', 'filters', named_filters))
     search = Search(aggs=filters_aggs, idx=ElasticSettings.idx('DEFAULT'))
     buckets = search.search().aggs['test_filters'].get_buckets()
     self.assertTrue('start_lt' in buckets,
                     "returned avg aggregation")
 def test_terms_query(self):
     ''' Test building and running a terms query with highlighting on
     the id field. '''
     id_highlight = Highlight(["id"])
     terms_query = ElasticQuery(Query.terms("id", ["rs2476601", "rs768019142"]),
                                highlight=id_highlight)
     docs = Search(terms_query, idx=ElasticSettings.idx('DEFAULT')).search().docs
     self.assertTrue(len(docs) == 2,
                     "Elastic string query retrieved markers (rs2476601, rs768019142)")
     first_doc = docs[0]
     self.assertTrue(first_doc.seqid, "Hit attribute found")
     self.assertTrue(first_doc.highlight() is not None, "highlighting found")
 def test_range(self):
     ''' Range aggregation on start with two ranges gives two buckets. '''
     ranges = [{"to": 10000},
               {"from": 10000, "to": 15000}]
     range_aggs = Aggs(Agg("test_range_agg", "range", {"field": "start", "ranges": ranges}))
     search = Search(aggs=range_aggs, idx=ElasticSettings.idx('DEFAULT'))
     buckets = search.search().aggs['test_range_agg'].get_buckets()
     self.assertTrue(len(buckets) == 2,
                     "returned two buckets in range aggregations")
 def test_or_filtered_query(self):
     ''' Test a highlighted term query filtered by an or filter made of a
     range query, a bool query and a wrapped query string query. '''
     highlight = Highlight(["id", "seqid"])
     spanning = BoolQuery(must_arr=[RangeQuery("start", lte=1),
                                    RangeQuery("end", gte=100000)])
     or_filter = OrFilter(RangeQuery("start", gte=1, lte=100000))
     (or_filter.extend(spanning)
               .extend(Query.query_string("rs*", fields=["id", "seqid"]).query_wrap()))
     filtered = ElasticQuery.filtered(Query.term("seqid", 1), or_filter, highlight=highlight)
     search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
     hits_total = search.search().hits_total
     self.assertTrue(hits_total >= 1, "Elastic filtered query retrieved marker(s)")
Пример #31
0
    def get_rdm_docs(cls, idx, idx_type, qbool=Query.match_all(), sources=None, size=1):
        ''' Get random doc(s) from the indices.

        The query is wrapped in a function score query whose score is replaced
        by a random_score seeded with a fresh random integer on each call.

        @param idx: index passed to Search
        @param idx_type: index type passed to Search
        @param qbool: query to pick documents from (default: match all)
        @param sources: source fields passed to ElasticQuery (default: empty list)
        @param size: number of documents to request (default: 1)
        @return: the documents returned by the search
        '''
        # avoid a mutable default argument; an empty list is still what gets passed on
        if sources is None:
            sources = []

        score_function1 = ScoreFunction.create_score_function('random_score', seed=random.randint(0, 1000000))

        search_query = ElasticQuery(FunctionScoreQuery(qbool, [score_function1], boost_mode='replace'),
                                    sources=sources)
        elastic = Search(search_query=search_query, size=size, idx=idx, idx_type=idx_type)
        try:
            return elastic.search().docs
        except IndexError:
            # try again with a new random seed
            return cls.get_rdm_docs(idx, idx_type, qbool, sources, size)
    def test_bool_filtered_query4(self):
        ''' Test a match_all query filtered by a bool query whose must
        clauses include a match query.
        Note: query_wrap is used to wrap the match in a query object. '''
        bool_filter = BoolQuery()
        (bool_filter.should(RangeQuery("start", lte=20000))
                    .should(Query.term("seqid", 2))
                    .must(Query.match("id", "rs768019142").query_wrap())
                    .must(Query.term("seqid", 1)))

        filtered = ElasticQuery.filtered_bool(Query.match_all(), bool_filter,
                                              sources=["id", "seqid", "start"])
        search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
        hits_total = search.search().hits_total
        self.assertTrue(hits_total == 1, "Elastic filtered query retrieved marker (rs768019142)")
Пример #33
0
 def test_pub_ini_file2(self):
     ''' Test the publications command's load step for the DISEASE::TEST
     section, then search the index named in the ini file for
     publications tagged "test". '''
     captured = StringIO()
     call_command('publications', '--dir', TEST_DATA_DIR, '--steps', 'load',
                  sections='DISEASE::TEST', ini=MY_PUB_INI_FILE, stdout=captured)
     disease_idx = IniParser().read_ini(MY_PUB_INI_FILE)['DISEASE']['index']
     # refresh before searching the index
     Search.index_refresh(disease_idx)
     tag_query = ElasticQuery.query_string("test", fields=["tags.disease"])
     docs = Search(tag_query, idx=disease_idx).search().docs
     self.assertGreater(len(docs), 1)
    def test_bool_nested_filter(self):
        ''' Test a bool filter that has another bool query as one of its
        should clauses. '''
        inner_bool = BoolQuery()
        (inner_bool.must(Query.match("id", "rs768019142").query_wrap())
                   .must(Query.term("seqid", 1)))

        outer_bool = BoolQuery()
        (outer_bool.should(inner_bool)
                   .should(Query.term("seqid", 2)))
        filtered = ElasticQuery.filtered_bool(Query.match_all(), outer_bool,
                                              sources=["id", "seqid", "start"])
        search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
        hits_total = search.search().hits_total
        self.assertTrue(hits_total >= 1, "Nested bool filter query")
 def test_bool_query(self):
     ''' Test a highlighted bool query combining must, must_not and
     should clauses. '''
     bool_query = BoolQuery()
     highlight = Highlight(["id", "seqid"])
     (bool_query.must(Query.term("id", "rs768019142"))
                .must(RangeQuery("start", gt=1000))
                .must_not(Query.match("seqid", "2"))
                .should(Query.match("seqid", "3"))
                .should(Query.match("seqid", "1")))
     search = Search(ElasticQuery.bool(bool_query, highlight=highlight),
                     idx=ElasticSettings.idx('DEFAULT'))
     n_docs = len(search.search().docs)
     self.assertTrue(n_docs == 1, "Elastic string query retrieved marker (rs768019142)")
    def test_mapping_parent_child(self):
        ''' Test creating mapping with parent child relationship.

        A gene (parent) and publication (child) mapping are created in a scratch
        index, one document of each type is loaded, and the has_parent and
        has_child queries are checked. The scratch index is deleted before and
        after the test. '''
        gene_mapping = MappingProperties("gene")
        gene_mapping.add_property("symbol", "string", analyzer="full_name")
        inta_mapping = MappingProperties("publication", "gene")
        load = Loader()
        idx = "test__mapping__"+SEARCH_SUFFIX
        options = {"indexName": idx, "shards": 1}
        # remove any index left behind by an earlier run
        requests.delete(ElasticSettings.url() + '/' + idx)

        # add child mappings first
        status = load.mapping(inta_mapping, "publication", analyzer=Loader.KEYWORD_ANALYZER, **options)
        self.assertTrue(status, "mapping inteactions")
        status = load.mapping(gene_mapping, "gene", analyzer=Loader.KEYWORD_ANALYZER, **options)
        self.assertTrue(status, "mapping genes")

        # load docs and test has parent query
        json_data = '{"index": {"_index": "%s", "_type": "gene", "_id" : "1"}}\n' % idx
        json_data += json.dumps({"symbol": "PAX1"}) + '\n'
        json_data += '{"index": {"_index": "%s", "_type": "publication", "_id" : "2", "parent": "1"}}\n' % idx
        json_data += json.dumps({"pubmed": 1234}) + '\n'
        Bulk.load(idx, '', json_data)
        Search.index_refresh(idx)
        query = ElasticQuery.has_parent('gene', Query.match('symbol', 'PAX1'))
        elastic = Search(query, idx=idx, idx_type='publication', size=500)
        docs = elastic.search().docs
        # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
        self.assertEqual(len(docs), 1)
        self.assertEqual(getattr(docs[0], 'pubmed'), 1234)
        self.assertEqual(docs[0].parent(), '1')
        self.assertRaises(QueryError, ElasticQuery.has_parent, 'gene', 'xxxxx')

        # test has child query
        query = ElasticQuery.has_child('publication', Query.match('pubmed', 1234))
        elastic = Search(query, idx=idx, idx_type='gene', size=500)
        docs = elastic.search().docs
        self.assertEqual(len(docs), 1)
        self.assertEqual(getattr(docs[0], 'symbol'), 'PAX1')
        self.assertIsNone(docs[0].parent())
        requests.delete(ElasticSettings.url() + '/' + idx)
Пример #37
0
    def filter_queryset(self, request, queryset, view):
        ''' Override this method to request feature locations.

        The 'feature' (default 'PTPN22') and 'build' request parameters are
        used to search the marker, region and gene indices together. Each
        document found is turned into an ElasticObject holding the feature,
        chr, start, end and locusString. An ElasticObject holding an error is
        returned when the feature name is empty, and Http404 is raised on a
        TypeError, ValueError, IndexError or ConnectionError. '''
        try:
            allowed = getattr(view, 'filter_fields', [])
            filters = {key: value for key, value in request.GET.items() if key in allowed}
            query_str = filters.get('feature', 'PTPN22')
            build = self._get_build(filters.get('build', settings.DEFAULT_BUILD))
            if query_str is None or query_str == '':
                return [ElasticObject(initial={'error': 'No feature name provided.'})]

            search_fields = ['id', 'symbol', 'dbxrefs.ensembl', 'region_name']
            sources = ['start', 'stop', 'seqid', 'chromosome', 'disease_loci']
            idxs = ElasticSettings.getattr('IDX')

            # choose the marker index whose build label matches the requested build
            marker_idx_key = ''
            if build == ElasticSettings.get_label('MARKER', label='build'):
                marker_idx_key = 'MARKER'
            else:
                for idx_key in idxs:
                    if 'MARKER' in idx_key and build == ElasticSettings.get_label(idx_key, label='build'):
                        marker_idx_key = idx_key

            (idx, idx_type) = ElasticSettings.idx_names(marker_idx_key, 'MARKER')
            (idx_r, idx_type_r) = ElasticSettings.idx_names('REGION', 'REGION')
            (idx_g, idx_type_g) = ElasticSettings.idx_names('GENE', 'GENE')
            # search the marker, region and gene indices in one request
            idx = ','.join([idx, idx_r, idx_g])
            idx_type = ','.join([idx_type, idx_type_r, idx_type_g])

            equery = BoolQuery(must_arr=Query.query_string(query_str, fields=search_fields))
            search = Search(search_query=ElasticQuery(equery, sources), size=10, idx=idx, idx_type=idx_type)

            locs = []
            for doc in search.search().docs:
                if isinstance(doc, RegionDocument):
                    doc = Region.pad_region_doc(doc)

                # the position is split on ':' and then on '-' after removing any commas
                loc = doc.get_position(build=build).split(':')
                bounds = loc[1].replace(',', '').split('-')
                start = int(bounds[0])
                end = int(bounds[1]) if len(bounds) > 1 else start
                locs.append(ElasticObject(
                    {'feature': query_str,
                     'chr': loc[0],
                     'start': start,
                     'end': end,
                     'locusString': query_str+" ("+str(loc[1])+")"}))
            return locs
        except (TypeError, ValueError, IndexError, ConnectionError):
            raise Http404
Пример #38
0
def study_page(request, study):
    ''' Render the page for one or more comma-separated study ids.

    Raises Http404 when no id is given, when no study matches (after
    queuing an error message), or when nine or more documents match. '''
    if study is None:
        messages.error(request, 'No study id given.')
        raise Http404()

    study_ids = study.split(',')
    search = Search(ElasticQuery(Query.ids(study_ids)),
                    idx=ElasticSettings.idx('STUDY', 'STUDY'), size=5)
    result = search.search(obj_document=StudyDocument)

    if result.hits_total == 0:
        messages.error(request, 'Study(s) '+study+' not found.')
        raise Http404()
    if not result.hits_total < 9:
        raise Http404()

    # the page title is the list of matched study names
    title = ', '.join(doc.study_name for doc in result.docs)
    return render(request, 'study/study.html',
                  {'features': result.docs, 'title': title},
                  content_type='text/html')
Пример #39
0
    def filter_queryset(self, request, queryset, view):
        """ Fetch LD results for a marker from Rserve instead of filtering ``queryset``.

        Only GET parameters listed in the view's ``filter_fields`` are used:
        ``m1`` (required marker id), ``m2``, ``dataset``, ``window_size``,
        ``dprime``, ``rsq``, ``maf``, ``build`` and ``pos``.

        Returns a single-element list holding an ElasticObject built from the
        JSON string returned by the R function ``ld_run``, or one carrying an
        ``error`` entry when ``m1`` is missing or empty.

        Raises Http404 on TypeError, ValueError, IndexError (e.g. the marker
        lookup returned no document) or ConnectionError.
        """
        try:
            filterable = getattr(view, "filter_fields", [])
            filters = {k: v for k, v in request.GET.items() if k in filterable}

            mid1 = filters.get("m1")
            if mid1 is None or mid1 == "":
                return [ElasticObject(initial={"error": "No marker ID provided."})]

            dataset = filters.get("dataset", "EUR").replace("-", "")
            mid2 = filters.get("m2")
            window_size = int(filters.get("window_size", 1000000))
            dprime = filters.get("dprime", 0.0)
            rsq = filters.get("rsq", 0.8)
            # any truthy value supplied for maf/pos switches the flag on
            maf = filters.get("maf", False)
            if maf:
                maf = True
            build_version = filters.get("build", "GRCh38").lower()
            pos = filters.get("pos", False)
            if pos:
                pos = True

            # look up the sequence the first marker lies on
            query = ElasticQuery(BoolQuery(must_arr=[Query.term("id", mid1)]), sources=["seqid", "start"])
            elastic = Search(search_query=query, idx=ElasticSettings.idx("MARKER", "MARKER"), size=1)
            doc = elastic.search().docs[0]
            seqid = getattr(doc, "seqid")

            rserve = getattr(settings, "RSERVE")
            conn = pyRserve.connect(host=rserve.get("HOST"), port=rserve.get("PORT"))
            try:
                ld_str = conn.r.ld_run(
                    dataset,
                    seqid,
                    mid1,
                    marker2=mid2,
                    window_size=window_size,
                    dprime=dprime,
                    rsq=rsq,
                    maf=maf,
                    position=pos,
                    build_version=build_version,
                )
            finally:
                # always release the Rserve connection, even if the R call fails
                conn.close()
            ld_str = ld_str.replace("D.prime", "dprime").replace("R.squared", "rsquared")

            return [ElasticObject(initial=json.loads(str(ld_str)))]
        except (TypeError, ValueError, IndexError, ConnectionError):
            raise Http404
Пример #40
0
def marker_page(request):
    ''' Renders a marker page.

    The marker is read from the ``m`` GET parameter. Values starting with
    "rs" are searched in the ``id`` and ``rscurrent`` fields, anything else
    in ``name``. Matching documents are grouped by document type (marker,
    immunochip, rs_merge) and passed to the ``marker/marker.html`` template.

    Raises Http404 when no marker is given or nothing matches. '''
    query_dict = request.GET
    marker = query_dict.get("m")
    if marker is None:
        messages.error(request, 'No marker name given.')
        raise Http404()

    fields = ['id', 'rscurrent'] if marker.startswith("rs") else ['name']
    # size=0 on the search: the documents are read back from the top_hits
    # sub-aggregation of each document-type bucket instead
    sub_agg = Agg('top_hits', 'top_hits', {"size": 15})
    aggs = Aggs(Agg("types", "terms", {"field": "_type"}, sub_agg=sub_agg))
    query = ElasticQuery(Query.query_string(marker, fields=fields))
    elastic = Search(search_query=query, idx=ElasticSettings.idx('MARKER'), aggs=aggs, size=0)
    res = elastic.search()
    if res.hits_total >= 1:
        types = getattr(res.aggs['types'], 'buckets')
        marker_doc = None
        ic_docs = []
        history_docs = []
        for doc_type in types:
            hits = doc_type['top_hits']['hits']['hits']
            for hit in hits:
                doc = Document(hit)
                if 'marker' == doc_type['key']:
                    # if several marker hits exist the last one wins
                    marker_doc = doc
                elif 'immunochip' == doc_type['key']:
                    ic_docs.append(doc)
                elif 'rs_merge' == doc_type['key']:
                    history_docs.append(doc)

        criteria = {}
        if marker_doc is not None:
            # criteria are only looked up when a CRITERIA index is configured
            if ElasticSettings.idx('CRITERIA') is not None:
                criteria = views.get_criteria([marker_doc], 'marker', 'id', 'MARKER')
            marker_doc.marker_build = _get_marker_build(ElasticSettings.idx('MARKER'))

        context = {
            'marker': marker_doc,
            'old_dbsnp_docs': _get_old_dbsnps(marker),
            'ic': ic_docs,
            'history': history_docs,
            'criteria': criteria
        }
        return render(request, 'marker/marker.html', context,
                      content_type='text/html')
    elif res.hits_total == 0:
        messages.error(request, 'Marker '+marker+' not found.')
        raise Http404()
Пример #41
0
def _get_old_dbsnps(marker):
    ''' Look the marker up in the indices of older DBSNP versions.

    Every index whose settings key contains 'MARKER_' is treated as an old
    DBSNP index; they are visited in reverse-sorted index name order. The
    first hit from each index is returned, tagged with its marker build. '''
    index_names = sorted([ElasticSettings.idx(key) for key in ElasticSettings.getattr('IDX').keys()
                          if 'MARKER_' in key], reverse=True)
    if not index_names:
        return []

    marker_query = ElasticQuery(Query.query_string(marker, fields=['id', 'rscurrent']))
    found = []
    for index_name in index_names:
        hits = Search(search_query=marker_query, idx=index_name, idx_type='marker').search().docs
        if len(hits) > 0:
            first_hit = hits[0]
            first_hit.marker_build = _get_marker_build(index_name)
            found.append(first_hit)
    return found
Пример #42
0
def disease_page(request, disease):
    ''' Renders a disease page.

    ``disease`` is a comma-separated string of disease codes; it is
    lower-cased before being matched against the ``code`` field.

    Raises Http404 when no disease is given, when nothing matches (after
    queuing an error message), or when nine or more documents match. '''
    # validate before touching the value: calling lower() on None would raise
    # AttributeError and the 404 below could never be reached
    if disease is None:
        messages.error(request, 'No disease given.')
        raise Http404()
    disease = disease.lower()
    # NOTE(review): the codes are passed as a list nested inside a list -
    # confirm that Query.terms expects this rather than a flat list
    query = ElasticQuery(Query.terms("code", [disease.split(',')]))
    elastic = Search(query, idx=ElasticSettings.idx('DISEASE', 'DISEASE'), size=5)
    res = elastic.search()
    if res.hits_total == 0:
        messages.error(request, 'Disease(s) '+disease+' not found.')
    elif res.hits_total < 9:
        names = ', '.join([getattr(doc, 'name') for doc in res.docs])
        context = {'features': res.docs, 'title': names}
        return render(request, 'disease/index.html', context, content_type='text/html')
    raise Http404()
Пример #43
0
def region_page(request, region):
    ''' Render the page for one or more comma-separated region ids.

    Raises Http404 when no region is given, when nothing matches (after
    queuing an error message), or when nine or more documents match. '''
    if region is None:
        messages.error(request, 'No region given.')
        raise Http404()

    region_ids = region.split(',')
    search = Search(ElasticQuery(Query.ids(region_ids)),
                    idx=ElasticSettings.idx('REGION', 'REGION'), size=5)
    result = search.search()

    if result.hits_total == 0:
        messages.error(request, 'Region(s) '+region+' not found.')
        raise Http404()
    if not result.hits_total < 9:
        raise Http404()

    title = ', '.join(doc.region_name for doc in result.docs)
    # the template receives the padded version of each region document
    padded_regions = [Region.pad_region_doc(doc) for doc in result.docs]
    return render(request, 'region/index.html',
                  {'features': padded_regions, 'title': title},
                  content_type='text/html')
Пример #44
0
    def get_marker(cls, request, marker, context):
        ''' Fill ``context`` with the documents needed to render a marker page.

        Values starting with "rs" are searched in ``id`` and ``rscurrent``,
        anything else in ``name``. Hits are read from the top_hits
        sub-aggregation of each document-type bucket and split into the
        marker document, immunochip documents and rs_merge (history) documents.

        Returns the updated ``context``. Raises Http404 when no marker is
        given or nothing matches. '''
        if marker is None:
            messages.error(request, 'No marker name given.')
            raise Http404()

        search_fields = ['name']
        if marker.startswith("rs"):
            search_fields = ['id', 'rscurrent']
        top_hits = Agg('top_hits', 'top_hits', {"size": 15})
        type_aggs = Aggs(Agg("types", "terms", {"field": "_type"}, sub_agg=top_hits))
        search = Search(search_query=ElasticQuery(Query.query_string(marker, fields=search_fields)),
                        idx=ElasticSettings.idx('MARKER'), aggs=type_aggs, size=0)
        result = search.search()
        title = ''
        if result.hits_total >= 1:
            marker_doc = None
            # documents of the secondary types, collected per type key
            docs_by_type = {'immunochip': [], 'rs_merge': []}
            for bucket in getattr(result.aggs['types'], 'buckets'):
                for hit in bucket['top_hits']['hits']['hits']:
                    doc = PydginDocument.factory(hit)
                    name = doc.get_name()
                    if name is not None:
                        # the last named document provides the page title
                        title = name

                    type_key = bucket['key']
                    if type_key == 'marker':
                        marker_doc = doc
                    elif type_key in docs_by_type:
                        docs_by_type[type_key].append(doc)

            if marker_doc is not None:
                marker_doc.marker_build = _get_marker_build(ElasticSettings.idx('MARKER'))

            context['criteria'] = MarkerView.criteria_disease_tags(request, [marker])
            context['features'] = [marker_doc]
            context['old_dbsnp_docs'] = _get_old_dbsnps(marker)
            context['ic'] = docs_by_type['immunochip']
            context['history'] = docs_by_type['rs_merge']
            context['title'] = title
            context['jbrowse_tracks'] = "PydginRegions%2Cdbsnp146%2CEnsemblGenes"
            return context
        elif result.hits_total == 0:
            messages.error(request, 'Marker '+marker+' not found.')
            raise Http404()
Пример #45
0
def gene_page(request):
    """ Render the page for the comma-separated gene ids in the ``g`` GET parameter.

    Raises Http404 when no gene is given, when nothing matches (after
    queuing an error message), or when nine or more documents match.
    """
    gene = request.GET.get("g")
    if gene is None:
        messages.error(request, "No gene name given.")
        raise Http404()

    gene_ids = gene.split(",")
    search = Search(ElasticQuery(Query.ids(gene_ids)),
                    idx=ElasticSettings.idx("GENE", "GENE"), size=5)
    result = search.search()

    if result.hits_total == 0:
        messages.error(request, "Gene(s) " + gene + " not found.")
        raise Http404()
    if not result.hits_total < 9:
        raise Http404()

    # the page title is the list of matched gene symbols
    title = ", ".join(doc.symbol for doc in result.docs)
    context = {
        "genes": result.docs,
        "title": title,
        "criteria": get_criteria(result.docs, "gene", "symbol", "GENE"),
    }
    return render(request, "gene/gene.html", context, content_type="text/html")
Пример #46
0
    def get_overlapping_hits(self, build, seqid, start, end):
        ''' Return study hit docs (tier <= 2) overlapping a genomic interval.

        A hit's nested ``build_info`` entry must be on the given ``seqid`` and
        ``build`` and satisfy one of three conditions: its start lies within
        [start, end], its end lies within [start, end], or it spans the whole
        interval (start <= ``start`` and end >= ``end``). '''
        # hit completely covers the requested interval
        spans_interval = BoolQuery(must_arr=[RangeQuery("build_info.start", lte=start),
                                             RangeQuery("build_info.end", gte=end)])
        # hit starts or ends inside the interval, or covers it
        overlap_filter = OrFilter(RangeQuery("build_info.start", gte=start, lte=end))
        (overlap_filter
         .extend(RangeQuery("build_info.end", gte=start, lte=end))
         .extend(spans_interval))

        on_seq_and_build = BoolQuery(must_arr=[Query.term("build_info.seqid", seqid),
                                               Query.term("build_info.build", build)])
        located_query = FilteredQuery(on_seq_and_build, overlap_filter)

        nested_query = Query.nested("build_info", located_query)
        tier_filter = BoolQuery(must_arr=[RangeQuery("tier", lte=2)])
        query = ElasticQuery.filtered_bool(nested_query, tier_filter)

        search = Search(search_query=query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'))
        return search.search().docs
Пример #47
0
    def fetch_disease_locus(cls, hits_docs):
        ''' Fetch the disease locus document for each distinct ``disease_locus``
        id found on the docs in ``hits_docs.docs``.

        Each id is queried once. An id is skipped, with a critical log entry,
        unless the lookup returns exactly one document. '''
        region_index = ElasticSettings.idx('REGIONS', idx_type='DISEASE_LOCUS')
        seen_locus_ids = set()
        disease_loc_docs = []
        for hit_doc in hits_docs.docs:
            locus_id = hit_doc.disease_locus
            if locus_id in seen_locus_ids:
                continue
            seen_locus_ids.add(locus_id)

            search = Search(ElasticQuery(Query.ids([locus_id])), idx=region_index)
            matches = search.search().docs
            if len(matches) != 1:
                logger.critical('disease_locus doc not found for it ' + locus_id)
                continue
            disease_loc_docs.append(matches[0])

        return disease_loc_docs
Пример #48
0
    def get_region(cls, request, region, context):
        ''' Fill ``context`` with the data needed to render a region page.

        ``region`` is a comma-separated string of region ids. Returns the
        updated ``context``. Raises Http404 when no region is given, when
        nothing matches (after queuing an error message), or when nine or
        more documents match. '''
        if region is None:
            messages.error(request, 'No region given.')
            raise Http404()

        region_ids = region.split(',')
        search = Search(ElasticQuery(Query.ids(region_ids)),
                        idx=ElasticSettings.idx('REGION', 'REGION'), size=5)
        result = search.search()

        if result.hits_total == 0:
            messages.error(request, 'Region(s) '+region+' not found.')
            raise Http404()
        if not result.hits_total < 9:
            raise Http404()

        context['features'] = [Region.pad_region_doc(doc) for doc in result.docs]

        # criteria tags are looked up by the document ids of the matched regions
        feature_ids = [doc.doc_id() for doc in result.docs]
        context['criteria'] = RegionView.criteria_disease_tags(request, feature_ids)

        context['title'] = ', '.join(doc.region_name for doc in result.docs)
        return context
        def get_pmids(resp_json):
            ''' Assert that every PMID referenced by the hits in ``resp_json``
            has a document in the publication index.

            Uses ``self`` from the enclosing scope for the assertions. '''
            hit_pmids = [getattr(Document(hit), "pmid") for hit in resp_json['hits']['hits']]
            pmids = list(set(hit_pmids))

            ids_filter = Filter(Query.ids(pmids))
            pub_query = ElasticQuery(BoolQuery(b_filter=ids_filter), sources=['pmid'])
            elastic = Search(search_query=pub_query,
                             idx=ElasticSettings.idx('PUBLICATION'), size=len(pmids)*2)

            if elastic.get_count()['count'] != len(pmids):
                # counts differ: report exactly which PMIDs are missing
                indexed_pmids = [getattr(doc, 'pmid') for doc in elastic.search().docs]
                pmids_diff = list(set(pmids) - set(indexed_pmids))
                self.assertListEqual([], pmids_diff, "PMIDs list empty ("+str(pmids_diff)+")")

            self.assertEqual(len(pmids), elastic.get_count()['count'], 'Count for region publications')
Пример #50
0
    def get_comparison_results(cls, criteria_idx, criteria_idx_type, old_criteria_results, primary_id_type,
                               criteria_sub_class):
        ''' Compare docs in the new criteria index against ``old_criteria_results``.

        The docs whose ids are the keys of ``old_criteria_results`` are fetched
        and each one is compared, via ``cls.compare_dicts``, with the old result
        stored under its ``qid``. Returns the list of non-empty comparison
        results. Progress is printed to stdout. '''
        old_ids = list(old_criteria_results.keys())
        search = Search(ElasticQuery(Query.ids(old_ids)), idx=criteria_idx,
                        idx_type=criteria_idx_type, size=len(old_criteria_results))
        criteria_docs = search.search().docs

        print('Number of docs from new criteria elastic index for criteria type  ' +
              criteria_idx_type + '    ' + str(len(criteria_docs)))

        differences = []
        for position, criteria_doc in enumerate(criteria_docs, start=1):
            print('==========' + str(position) + '==========')
            print(criteria_doc.__dict__)
            qid = criteria_doc.qid
            outcome = cls.compare_dicts(criteria_doc.__dict__, old_criteria_results[qid],
                                        primary_id_type, criteria_sub_class, criteria_idx_type)
            if len(outcome) > 0:
                differences.append(outcome)

        return differences
Пример #51
0
    def get_rdm_docs(cls, idx, idx_type, qbool=Query.match_all(), sources=[], size=1):
        ''' Return ``size`` randomly chosen docs from the given index and type.

        ``qbool`` restricts the candidate docs; their score is replaced by a
        random score so that the top ``size`` hits form a random sample.
        ``sources`` is handed to the query as its source filter. '''
        seed = random.randint(0, 1000000)
        random_score = ScoreFunction.create_score_function('random_score', seed=seed)
        scored_query = FunctionScoreQuery(qbool, [random_score], boost_mode='replace')
        search = Search(search_query=ElasticQuery(scored_query, sources=sources),
                        size=size, idx=idx, idx_type=idx_type)
        try:
            return search.search().docs
        except IndexError:
            # try again; the retry draws a new random seed
            return cls.get_rdm_docs(idx, idx_type, qbool, sources, size)
Пример #52
0
        def check_hits(resp_json):
            ''' Check one batch of hits for docs that share an ``internal_id``
            but differ in position, logging an error for each such pair.

            Relies on ``self`` and the ``internal_id`` mapping (internal id ->
            list of docs seen so far) from the enclosing scope; every doc in
            the batch is added to that mapping. '''
            self.assertTrue('hits' in resp_json, 'scan and scroll hits')
            self.assertGreaterEqual(len(resp_json['hits']['hits']), 1)
            docs = [Document(hit) for hit in resp_json['hits']['hits']]
            for doc1 in docs:
                doc_internal_id = getattr(doc1, "internal_id")
                if doc_internal_id in internal_id:
                    # compare against every doc already seen with this internal id
                    pos1 = self._get_highest_build(doc1)
                    for doc2 in internal_id[doc_internal_id]:
                        pos2 = self._get_highest_build(doc2)
                        if pos2['position'] != pos1['position']:
                            msg = ("DIFFERENT POSITIONS ID: "+str(doc_internal_id)+":\t" +
                                   str(getattr(doc1, "name"))+": "+pos1['position']+" ("+doc1.doc_id()+")\t" +
                                   str(getattr(doc2, "name"))+": "+pos2['position']+" ("+doc2.doc_id()+")\t")
                            try:
                                # look for markers on the same seqid that start at either position
                                terms_filter = TermsFilter.get_terms_filter("start", [pos1['position'],
                                                                                      pos2['position']])
                                query = ElasticQuery.filtered(Query.term("seqid", pos1['seqid']), terms_filter)
                                elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
                                docs_by_pos = elastic.search().docs
                                found = False
                                for d in docs_by_pos:
                                    msg += getattr(d, "id")+": "+str(getattr(d, "start"))+"\t"
                                    if getattr(d, "id") == 'rs'+str(doc_internal_id):
                                        found = True

                                if not found:
                                    # the expected rs id is at neither position; report
                                    # whether self._rs_exists finds it at all
                                    msg += 'rs'+str(doc_internal_id)
                                    if self._rs_exists('rs'+str(doc_internal_id)):
                                        msg += ' EXISTS IN DBSNP\t'
                                    else:
                                        msg += ' NOT IN DBSNP\t'
                                logger.error(msg)
                            except KeyError:
                                # presumably a position dict without 'seqid' - the
                                # mismatch is still logged, without the marker details
                                logger.error(msg)
                    internal_id[doc_internal_id].append(doc1)
                else:
                    # first doc seen with this internal id
                    internal_id[doc_internal_id] = [doc1]
    def test_nested_query(self):
        ''' Check a nested query combined with nested and reverse-nested aggregations. '''
        # passing a plain string as the nested query is expected to raise QueryError
        with self.assertRaises(QueryError):
            Query.nested('build_info', 'xxxx')
        build_38_query = ElasticQuery(Query.nested('build_info', Query.term("build_info.build", "38")))

        # aggregation tree, innermost first:
        # build_info (nested) > seq_hits (terms) > disease_hits (reverse_nested) > diseases_by_seqid (terms)
        disease_terms_agg = Agg('diseases_by_seqid', 'terms', {"size": 0, "field": "disease"})
        reverse_nested_agg = Agg('disease_hits', 'reverse_nested', {}, sub_agg=disease_terms_agg)
        seqid_terms_agg = Agg('seq_hits', 'terms', {'field': 'build_info.seqid', 'size': 0},
                              sub_agg=reverse_nested_agg)
        nested_agg = Agg('build_info', 'nested', {"path": 'build_info'}, sub_agg=[seqid_terms_agg])

        search = Search(build_38_query, idx=IDX['JSON_NESTED']['indexName'], aggs=Aggs(nested_agg))
        result = search.search()

        # only the build 38 hits are returned
        self.assertEqual(len(result.docs), 2)

        # two seq ids are expected
        seq_buckets = getattr(result.aggs['build_info'], 'seq_hits')['buckets']
        self.assertEqual(len(seq_buckets), 2)
        for seq_bucket in seq_buckets:
            # one disease is expected on each sequence
            disease_buckets = seq_bucket['disease_hits']['diseases_by_seqid']['buckets']
            self.assertEqual(len(disease_buckets), 1)
Пример #54
0
def show_es_gene_section(gene_symbol=None, seqid=None,
                         start_pos=None, end_pos=None):
    ''' Template inclusion tag returning the genes to render in a gene section.

    Genes are looked up by ``gene_symbol`` when one is given, otherwise by
    position on chromosome ``seqid`` (any 'chr' text is removed from it).
    Returns a dict with the matching docs under ``es_genes``. '''
    seqid = str(seqid).replace('chr', '')
    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("symbol", gene_symbol)
    else:
        on_chromosome = Query.match("chromosome", seqid)
        if end_pos is None:
            # single position (e.g. a snp): genes that span start_pos
            bounds = [RangeQuery("start", lte=start_pos),
                      RangeQuery("stop", gte=start_pos)]
        else:
            # interval: genes lying entirely within start_pos..end_pos
            bounds = [RangeQuery("start", gte=start_pos),
                      RangeQuery("stop", lte=end_pos)]
        query = ElasticQuery.bool(BoolQuery(must_arr=[on_chromosome] + bounds))

    search = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': search.search().docs}
Пример #55
0
    def filter_queryset(self, request, queryset, view):
        ''' Fetch LD results for a marker from Rserve instead of filtering ``queryset``.

        Only GET parameters listed in the view's ``filter_fields`` are used:
        ``m1`` (marker id, defaults to 'rs2476601'), ``m2``, ``dataset``,
        ``window_size``, ``dprime``, ``rsq``, ``maf``, ``build`` and ``pos``.

        Returns a single-element list holding an ElasticObject built from the
        JSON string returned by the R function ``ld_run``. On TypeError,
        ValueError, IndexError (e.g. the marker lookup returned no document)
        or ConnectionError the object carries ``{'ld': None}`` instead. '''
        try:
            filterable = getattr(view, 'filter_fields', [])
            filters = {k: v for k, v in request.GET.items() if k in filterable}

            mid1 = filters.get('m1', 'rs2476601')
            dataset = filters.get('dataset', 'EUR').replace('-', '')
            mid2 = filters.get("m2")
            window_size = int(filters.get('window_size', 1000000))
            dprime = filters.get("dprime", 0.)
            rsq = filters.get("rsq", 0.8)
            # any truthy value supplied for maf/pos switches the flag on
            maf = filters.get("maf", False)

            if maf:
                maf = True
            build_version = filters.get("build", 'GRCh38').lower()
            pos = filters.get("pos", False)
            if pos:
                pos = True

            # look up the sequence the first marker lies on
            query = ElasticQuery(BoolQuery(must_arr=[Query.term("id", mid1)]), sources=['seqid', 'start'])
            elastic = Search(search_query=query, idx=ElasticSettings.idx('MARKER', 'MARKER'), size=1)
            doc = elastic.search().docs[0]
            seqid = getattr(doc, 'seqid')

            rserve = getattr(settings, 'RSERVE')
            conn = pyRserve.connect(host=rserve.get('HOST'), port=rserve.get('PORT'))
            try:
                ld_str = conn.r.ld_run(dataset, seqid, mid1, marker2=mid2,
                                       window_size=window_size, dprime=dprime,
                                       rsq=rsq, maf=maf, position=pos, build_version=build_version)
            finally:
                # always release the Rserve connection, even if the R call fails
                conn.close()
            ld_str = ld_str.replace('D.prime', 'dprime').replace('R.squared', 'rsquared')
            return [ElasticObject(initial=json.loads(str(ld_str)))]
        except (TypeError, ValueError, IndexError, ConnectionError):
            return [ElasticObject(initial={'ld': None})]
Пример #56
0
def _search_engine(query_dict, user_filters, user):
    ''' Carry out a search and return the results as a dict.

    query_dict   -- mapping read for the "query" (search text) and "idx"
                    (search index name) entries.
    user_filters -- filter parameters; read via getlist/get/items, so
                    presumably a Django QueryDict - TODO confirm.
    user         -- passed on to the index/filter/authorisation helpers.

    Returns a dict with the top hits ('data'), aggregations, the original
    query, index name, searched fields, index mappings, total hit count,
    the maximum number of hits per index and the time taken. '''
    user_query = query_dict.get("query")
    query = _gene_lookup(user_query)

    # fields returned in the _source of each top hit
    source_filter = [
        'symbol', 'synonyms', "dbxrefs.*", 'biotype', 'description',  # gene
        'id', 'rscurrent', 'rshigh',                                  # marker
        'journal', 'title', 'tags.disease',                           # publication
        'name', 'code',                                               # disease
        'study_id', 'study_name',                                     # study
        'region_name', 'marker']                                      # regions

    # a query made up only of digits and spaces may be one or more PMIDs
    if re.compile(r'^[0-9 ]+$').findall(query):
        source_filter.append('pmid')      # publication - possible PMID(s)
    search_fields = []
    maxsize = 20
    if user_filters.getlist("maxsize"):
        maxsize = int(user_filters.get("maxsize"))

    # build search_fields from user input filter fields
    for it in user_filters.items():
        if len(it) == 2:
            if it[0] == 'query':
                continue
            # filter values are colon separated; the second part is the field
            # name and an optional third part is a sub-field
            parts = it[1].split(":")
            if len(parts) == 3:
                search_fields.append(parts[1]+"."+parts[2])
            elif len(parts) == 2:
                search_fields.append(parts[1])

    # no field filters given: search the returned source fields (copied at
    # this point, before the extra ones below are added) plus further fields
    if len(search_fields) == 0:
        search_fields = list(source_filter)
        search_fields.extend(['abstract', 'authors.name',   # publication
                              'authors', 'pmids',                    # study
                              'markers', 'genes'])                   # study/region
    # extra fields that are returned but not added to the default search
    # fields; 'pmid' may now be present twice if it was appended above
    source_filter.extend(['date', 'pmid', 'build_id', 'ref', 'alt', 'chr_band',
                          'disease_locus', 'disease_loci', 'region_id'])

    idx_name = query_dict.get("idx")
    idx_dict = ElasticSettings.search_props(idx_name, user)
    query_filters = _get_query_filters(user_filters, user)

    # hits are returned per index through a top_hits sub-aggregation, with
    # the matching text highlighted in <strong> tags
    highlight = Highlight(search_fields, pre_tags="<strong>", post_tags="</strong>", number_of_fragments=0)
    sub_agg = Agg('idx_top_hits', 'top_hits', {"size": maxsize, "_source": source_filter,
                                               "highlight": highlight.highlight['highlight']})
    aggs = Aggs([Agg("idxs", "terms", {"field": "_index"}, sub_agg=sub_agg),
                 Agg("biotypes", "terms", {"field": "biotype", "size": 0}),
                 Agg("categories", "terms", {"field": "_type", "size": 0})])

    # create score functions
    score_fns = _build_score_functions(idx_dict)
    equery = BoolQuery(must_arr=Query.query_string(query, fields=search_fields),
                       should_arr=_auth_arr(user),
                       b_filter=query_filters,
                       minimum_should_match=1)

    # size=0: no plain hits are requested, only the aggregations
    search_query = ElasticQuery(FunctionScoreQuery(equery, score_fns, boost_mode='replace'))
    elastic = Search(search_query=search_query, aggs=aggs, size=0,
                     idx=idx_dict['idx'], idx_type=idx_dict['idx_type'])
    result = elastic.search()

    mappings = elastic.get_mapping()
    _update_mapping_filters(mappings, result.aggs)
    _update_biotypes(user_filters, result)

    return {'data': _top_hits(result), 'aggs': result.aggs,
            'query': user_query, 'idx_name': idx_name,
            'fields': search_fields, 'mappings': mappings,
            'hits_total': result.hits_total,
            'maxsize': maxsize, 'took': result.took}
Пример #57
0
 def fetch_from_elastic(cls, idx, idx_type, feature_ids):
     ''' Return the docs with the given ids from the configured index and
     type; at most five docs are requested. '''
     index_name = ElasticSettings.idx(idx, idx_type=idx_type)
     search = Search(ElasticQuery(Query.ids(feature_ids)), idx=index_name, size=5)
     return search.search().docs
 def test_string_query_with_wildcard_and_highlight(self):
     ''' Test a wildcard query string on the id field with highlighting. '''
     strong_tags = Highlight("id", pre_tags="<strong>", post_tags="</strong>")
     wildcard_query = ElasticQuery.query_string("rs*", fields=["id"], highlight=strong_tags)
     docs = Search(wildcard_query, idx=ElasticSettings.idx('DEFAULT'), size=5).search().docs
     self.assertTrue(len(docs) > 1, "Elastic string query retrieved marker (rs*)")
Пример #59
0
    def test_elastic_group_name(self):
        '''
        Testing the workflow defined in: https://killin.cimr.cam.ac.uk/nextgensite/2015/08/05/region-authorization/
        Testing various elastic queries for public and private (group restricted) regions.

        idx doc:
         "_source":{"attr": {"region_id": "803", "group_name": "[\"DIL\"]", "Name": "4q27"},
         "seqid": "chr4", "source": "immunobase", "type": "region",
         "score": ".", "strand": ".", "phase": ".", "start": 122061159, "end": 122684373}
        idx_query:
        Private(in given group) OR Public
        -d '{"query":{"filtered":{"filter":{"bool": {
                                            "should": [
                                                        {"terms": {"group_name":["dil"]}},
                                                        { "missing": { "field": "group_name"   }}
                                                      ]
                                                    }}}}}'
        Private(in given group):
        -d '{"query":{"filtered":{"filter":{"terms":{"group_name":["dil"]}}}}}'
        Public:
        -d {'query': {'filtered': {'filter': {'missing': {'field': 'group_name'}},
                                   'query': {'term': {'match_all': '{}'}}}}}
        '''
        # get the groups for the given user
        response = self.client.post('/accounts/login/', {
            'username': '******',
            'password': '******'
        })
        # NOTE(review): assertTrue treats "200" as the failure message, so this
        # passes for any non-zero status code - confirm the expected status and
        # compare against it explicitly
        self.assertTrue(response.status_code, "200")

        logged_in_user = User.objects.get(
            id=self.client.session['_auth_user_id'])
        if logged_in_user and logged_in_user.is_authenticated():
            user_groups = get_user_groups(logged_in_user)
            self.assertTrue('READ' in user_groups,
                            "user present in READ group")
            # make sure the user is not yet in DIL group
            self.assertFalse('DIL' in user_groups,
                             "user not present in DIL group")

        # ignore the READ group; the remaining group names are lower-cased
        group_names = get_user_groups(logged_in_user)
        if 'READ' in group_names: group_names.remove('READ')  # @IgnorePep8
        group_names = [x.lower() for x in group_names]
        self.assertTrue(len(group_names) == 0, "No group present")

        # Match all query, as there is no group we do a match all
        query = ElasticQuery(Query.match_all())
        expected_query_string = {"query": {"match_all": {}}}
        self.assertJSONEqual(json.dumps(query.query),
                             json.dumps(expected_query_string),
                             "Query string matched")

        # refresh the index before searching it
        Search.index_refresh(self.index_name)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 12,
            "Elastic string query retrieved all public regions")

        # Filtered query for group names, add the user to DIL group and get the query string
        self.dil_group = Group.objects.create(name='DIL')
        logged_in_user.groups.add(self.dil_group)
        group_names = get_user_groups(logged_in_user)
        if 'READ' in group_names: group_names.remove('READ')  # @IgnorePep8
        group_names = [x.lower() for x in group_names]
        self.assertTrue(len(group_names) > 0, "More than 1 group present")
        self.assertTrue("dil" in group_names, "DIL group present")

        # retrieves all docs with missing field group_name - 11 docs
        terms_filter = TermsFilter.get_missing_terms_filter(
            "field", "attr.group_name")
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 11,
            "Elastic string query retrieved all public regions")

        # build filtered boolean query to bring all public docs + private docs 11+1 = 12 docs
        query_bool = BoolQuery()
        query_bool.should(Query.missing_terms("field", "group_name")) \
                  .should(Query.terms("group_name", group_names).query_wrap())

        query = ElasticQuery.filtered_bool(Query.match_all(), query_bool)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 12,
            "Elastic string query retrieved both public + private regions")

        # only the private doc(s) belonging to the user's groups - 1 doc
        terms_filter = TermsFilter.get_terms_filter("attr.group_name",
                                                    group_names)
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 1,
            "Elastic string query retrieved one private regions")
        self.assertEqual(docs[0].attr['Name'], "4q27", "type matched region")
        self.assertEqual(docs[0].attr['region_id'], "803",
                         "type matched region")
        self.assertEqual(docs[0].attr['group_name'], "[\"DIL\"]",
                         "type matched region")
    def test_elastic_group_name(self):
        '''
        Test the region-authorization workflow defined in:
        https://killin.cimr.cam.ac.uk/nextgensite/2015/08/05/region-authorization/

        The test logs a user in, checks the groups returned by get_user_groups
        before and after adding the user to the DIL group, and runs the
        public / private / public-or-private queries against self.index_name.

        idx doc:
         "_source":{"attr": {"region_id": "803", "group_name": "[\"DIL\"]", "Name": "4q27"},
         "seqid": "chr4", "source": "immunobase", "type": "region",
         "score": ".", "strand": ".", "phase": ".", "start": 122061159, "end": 122684373}
        idx_query:
        Private(in given group) OR Public
        -d '{"query":{"filtered":{"filter":{"bool": {
                                            "should": [
                                                        {"terms": {"group_name":["dil"]}},
                                                        { "missing": { "field": "group_name"   }}
                                                      ]
                                                    }}}}}'
        Private(in given group):
        -d '{"query":{"filtered":{"filter":{"terms":{"group_name":["dil"]}}}}}'
        Public:
        -d {'query': {'filtered': {'filter': {'missing': {'field': 'group_name'}},
                                   'query': {'term': {'match_all': '{}'}}}}}
        '''
        def _private_group_names(user):
            ''' Lower-cased group names for the user, leaving out the READ group. '''
            # Build a new list so the list returned by get_user_groups is not mutated.
            return [name.lower() for name in get_user_groups(user) if name != 'READ']

        # log in and get the groups for the given user
        response = self.client.post('/accounts/login/', {'username': '******', 'password': '******'})
        # NOTE(review): a successful login may answer with a redirect rather than 200,
        # so only error statuses are rejected here; the session check below is what
        # actually confirms that the login worked.
        self.assertLess(response.status_code, 400, "login request returned an error status")
        self.assertIn('_auth_user_id', self.client.session, "user is logged in")

        logged_in_user = User.objects.get(id=self.client.session['_auth_user_id'])
        # Fail loudly instead of silently skipping the group checks.
        self.assertTrue(logged_in_user.is_authenticated(), "user is authenticated")
        user_groups = get_user_groups(logged_in_user)
        self.assertIn('READ', user_groups, "user present in READ group")
        # make sure the user is not yet in DIL group
        self.assertNotIn('DIL', user_groups, "user not present in DIL group")

        group_names = _private_group_names(logged_in_user)
        self.assertEqual(len(group_names), 0, "No group present")

        # Match all query, as there is no group we do a match all
        query = ElasticQuery(Query.match_all())
        expected_query_string = {"query": {"match_all": {}}}
        self.assertJSONEqual(json.dumps(query.query), json.dumps(expected_query_string), "Query string matched")

        Search.index_refresh(self.index_name)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertEqual(len(docs), 12, "Elastic string query retrieved all public regions")

        # Filtered query for group names, add the user to DIL group and get the query string
        self.dil_group = Group.objects.create(name='DIL')
        logged_in_user.groups.add(self.dil_group)
        group_names = _private_group_names(logged_in_user)
        self.assertGreater(len(group_names), 0, "At least one group present")
        self.assertIn("dil", group_names, "DIL group present")

        # retrieves all docs with missing field group_name - 11 docs
        terms_filter = TermsFilter.get_missing_terms_filter("field", "attr.group_name")
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertEqual(len(docs), 11, "Elastic string query retrieved all public regions")

        # build filtered boolean query to bring all public docs + private docs 11+1 = 12 docs
        query_bool = BoolQuery()
        query_bool.should(Query.missing_terms("field", "group_name")) \
                  .should(Query.terms("group_name", group_names).query_wrap())

        query = ElasticQuery.filtered_bool(Query.match_all(), query_bool)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertEqual(len(docs), 12, "Elastic string query retrieved both public + private regions")

        # only the private doc(s) belonging to the user's groups - 1 doc
        terms_filter = TermsFilter.get_terms_filter("attr.group_name", group_names)
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertEqual(len(docs), 1, "Elastic string query retrieved one private regions")
        self.assertEqual(docs[0].attr['Name'], "4q27", "type matched region")
        self.assertEqual(docs[0].attr['region_id'], "803", "type matched region")
        self.assertEqual(docs[0].attr['group_name'], "[\"DIL\"]", "type matched region")