Exemplo n.º 1
0
def show_es_gene_section(gene_symbol=None,
                         seqid=None,
                         start_pos=None,
                         end_pos=None):
    ''' Template inclusion tag to render a gene section.

    Genes are looked up in the GENE index either by symbol or by location:

    - gene_symbol given: match on the "gene_symbol" field;
    - only start_pos given: features on seqid spanning that single position;
    - start_pos and end_pos given: features on seqid lying entirely within
      the [start_pos, end_pos] interval.

    Returns a dict with the matching documents under the 'es_genes' key.
    '''
    # Location queries use the "chr"-prefixed form of the sequence id.
    already_prefixed = isinstance(seqid, str) and seqid.startswith("chr")
    if not already_prefixed:
        seqid = 'chr' + str(seqid)

    if gene_symbol is not None:
        # gene symbol query
        es_query = ElasticQuery.query_match("gene_symbol", gene_symbol)
    else:
        on_seqid = Query.match("seqid", seqid)
        if end_pos is None:
            # single position (e.g. a SNP): the feature must span start_pos
            bounds = [RangeQuery("featureloc.start", lte=start_pos),
                      RangeQuery("featureloc.end", gte=start_pos)]
        else:
            # region: the feature must be contained in start_pos..end_pos
            bounds = [RangeQuery("featureloc.start", gte=start_pos),
                      RangeQuery("featureloc.end", lte=end_pos)]
        es_query = ElasticQuery.bool(BoolQuery(must_arr=[on_seqid] + bounds))

    gene_search = Search(es_query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': gene_search.search().docs}
Exemplo n.º 2
0
def show_es_gene_section(gene_symbol=None,
                         seqid=None,
                         start_pos=None,
                         end_pos=None):
    ''' Template inclusion tag to render a gene section.

    Searches the GENE index by "symbol" when gene_symbol is given, otherwise
    by location on the chromosome named by seqid (any 'chr' text is removed
    from it first). With only start_pos the gene must span that position;
    with start_pos and end_pos the gene must lie inside the interval.

    Returns a dict with the matching documents under the 'es_genes' key.
    '''
    chromosome = str(seqid).replace('chr', '')

    if gene_symbol is not None:
        # gene symbol query
        gene_query = ElasticQuery.query_match("symbol", gene_symbol)
    elif end_pos is None:
        # single position (e.g. a SNP): start <= start_pos <= stop
        spanning = BoolQuery(must_arr=[
            Query.match("chromosome", chromosome),
            RangeQuery("start", lte=start_pos),
            RangeQuery("stop", gte=start_pos),
        ])
        gene_query = ElasticQuery.bool(spanning)
    else:
        # region: start_pos <= start and stop <= end_pos
        contained = BoolQuery(must_arr=[
            Query.match("chromosome", chromosome),
            RangeQuery("start", gte=start_pos),
            RangeQuery("stop", lte=end_pos),
        ])
        gene_query = ElasticQuery.bool(contained)

    gene_index = ElasticSettings.idx(name='GENE')
    return {'es_genes': Search(gene_query, idx=gene_index).search().docs}
 def test_bool_query(self):
     ''' Test a bool query built from must, must_not and should clauses;
     exactly one document is expected back from the DEFAULT index. '''
     bool_clauses = BoolQuery()
     hl = Highlight(["id", "seqid"])
     # Clauses are chained on the builder; the chain result is not needed.
     (bool_clauses
         .must(Query.term("id", "rs768019142"))
         .must(RangeQuery("start", gt=1000))
         .must_not(Query.match("seqid", "2"))
         .should(Query.match("seqid", "3"))
         .should(Query.match("seqid", "1")))
     es_query = ElasticQuery.bool(bool_clauses, highlight=hl)
     search = Search(es_query, idx=ElasticSettings.idx('DEFAULT'))
     docs = search.search().docs
     self.assertTrue(len(docs) == 1, "Elastic string query retrieved marker (rs768019142)")
    def test_bool_nested_filter(self):
        ''' Test a bool filter that nests another bool query inside one of
        its should clauses; at least one hit is expected. '''
        inner = BoolQuery()
        (inner
            .must(Query.match("id", "rs768019142").query_wrap())
            .must(Query.term("seqid", 1)))

        outer = BoolQuery()
        (outer
            .should(inner)
            .should(Query.term("seqid", 2)))

        es_query = ElasticQuery.filtered_bool(Query.match_all(), outer,
                                              sources=["id", "seqid", "start"])
        search = Search(es_query, idx=ElasticSettings.idx('DEFAULT'))
        hits_total = search.search().hits_total
        self.assertTrue(hits_total >= 1, "Nested bool filter query")
    def test_bool_filtered_query4(self):
        ''' Test building and running a filtered boolean query that mixes
        should and must clauses; exactly one hit is expected.
        Note: query_wrap() is used to wrap the match in a query object. '''
        bool_filter = BoolQuery()
        (bool_filter
            .should(RangeQuery("start", lte=20000))
            .should(Query.term("seqid", 2))
            .must(Query.match("id", "rs768019142").query_wrap())
            .must(Query.term("seqid", 1)))

        es_query = ElasticQuery.filtered_bool(Query.match_all(), bool_filter,
                                              sources=["id", "seqid", "start"])
        search = Search(es_query, idx=ElasticSettings.idx('DEFAULT'))
        hits_total = search.search().hits_total
        self.assertTrue(hits_total == 1, "Elastic filtered query retrieved marker (rs768019142)")
Exemplo n.º 6
0
    def _get_current_build_info(self, seqid, position):
        ''' Get upper & lower boundaries for a hit given the position of the marker.

        Three single-document searches are run against the HAPMAP index, all
        restricted to seqid:

        1. the first entry at or after position gives the genetic map
           position of the marker;
        2. the first entry (ascending position) whose genetic map position
           is at least 0.1 above that value gives 'start';
        3. the last entry (descending position) whose genetic map position
           is at least 0.1 below that value gives 'end'.

        Returns a dict with keys 'build' (always 38), 'seqid', 'start', 'end'.
        Raises IndexError if any of the searches returns no document.
        '''
        def first_doc(range_query, order):
            # One HAPMAP hit on this seqid satisfying range_query, first in
            # the requested sort order.
            bool_query = BoolQuery(must_arr=[range_query,
                                             Query.match("seqid", seqid)])
            search = Search(ElasticQuery(bool_query),
                            idx=ElasticSettings.idx('HAPMAP', 'HAPMAP'),
                            qsort=Sort(order),
                            size=1)
            return search.search().docs[0]

        anchor = first_doc(RangeQuery("position", gte=position), 'position:asc')
        map_pos = getattr(anchor, "genetic_map_position")

        # NOTE(review): 'start' is taken from the entry *above* the marker's
        # genetic map position and 'end' from the entry *below* it, so start
        # presumably ends up greater than end - confirm this is intended.
        above = first_doc(RangeQuery("genetic_map_position", gte=(map_pos + 0.1)),
                          'position:asc')
        start = int(getattr(above, "position"))

        below = first_doc(RangeQuery("genetic_map_position", lte=(map_pos - 0.1)),
                          'position:desc')
        end = int(getattr(below, "position"))

        return {'build': 38, 'seqid': seqid, 'start': start, 'end': end}
    def test_mapping_parent_child(self):
        ''' Test creating mapping with parent child relationship.

        Creates a temporary index with a "gene" parent type and a
        "publication" child type, bulk loads one document of each and checks
        the has_parent / has_child queries in both directions.
        '''
        gene_mapping = MappingProperties("gene")
        gene_mapping.add_property("symbol", "string", analyzer="full_name")
        inta_mapping = MappingProperties("publication", "gene")
        load = Loader()
        idx = "test__mapping__"+SEARCH_SUFFIX
        options = {"indexName": idx, "shards": 1}
        idx_url = ElasticSettings.url() + '/' + idx
        # start from a clean slate in case an earlier run left the index behind
        requests.delete(idx_url)
        # registered up front so the index is removed even if an assertion
        # below fails
        self.addCleanup(requests.delete, idx_url)

        # add child mappings first
        status = load.mapping(inta_mapping, "publication", analyzer=Loader.KEYWORD_ANALYZER, **options)
        self.assertTrue(status, "mapping inteactions")
        status = load.mapping(gene_mapping, "gene", analyzer=Loader.KEYWORD_ANALYZER, **options)
        self.assertTrue(status, "mapping genes")

        # load docs and test has parent query
        json_data = '{"index": {"_index": "%s", "_type": "gene", "_id" : "1"}}\n' % idx
        json_data += json.dumps({"symbol": "PAX1"}) + '\n'
        json_data += '{"index": {"_index": "%s", "_type": "publication", "_id" : "2", "parent": "1"}}\n' % idx
        json_data += json.dumps({"pubmed": 1234}) + '\n'
        Bulk.load(idx, '', json_data)
        Search.index_refresh(idx)
        query = ElasticQuery.has_parent('gene', Query.match('symbol', 'PAX1'))
        elastic = Search(query, idx=idx, idx_type='publication', size=500)
        docs = elastic.search().docs
        # assertEqual: the assertEquals alias is deprecated and was removed
        # in Python 3.12
        self.assertEqual(len(docs), 1)
        self.assertEqual(getattr(docs[0], 'pubmed'), 1234)
        self.assertEqual(docs[0].parent(), '1')
        self.assertRaises(QueryError, ElasticQuery.has_parent, 'gene', 'xxxxx')

        # test has child query
        query = ElasticQuery.has_child('publication', Query.match('pubmed', 1234))
        elastic = Search(query, idx=idx, idx_type='gene', size=500)
        docs = elastic.search().docs
        self.assertEqual(len(docs), 1)
        self.assertEqual(getattr(docs[0], 'symbol'), 'PAX1')
        self.assertIsNone(docs[0].parent())
Exemplo n.º 8
0
def show_es_gene_section(gene_symbol=None, seqid=None,
                         start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section.

    A gene_symbol, when given, is matched against the "symbol" field.
    Otherwise genes are selected by location on the chromosome named by
    seqid (with any 'chr' text removed): those spanning start_pos when
    end_pos is None, or those contained in start_pos..end_pos.

    Returns a dict with the matching GENE index documents under 'es_genes'.
    '''
    seqid = str(seqid).replace('chr', '')

    def location_query(start_range, stop_range):
        # Bool query requiring the chromosome and both coordinate ranges.
        must = [Query.match("chromosome", seqid),
                RangeQuery("start", **start_range),
                RangeQuery("stop", **stop_range)]
        return ElasticQuery.bool(BoolQuery(must_arr=must))

    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("symbol", gene_symbol)
    elif end_pos is None:
        # single position (e.g. a SNP): the gene must span start_pos
        query = location_query({'lte': start_pos}, {'gte': start_pos})
    else:
        # region: the gene must lie within start_pos..end_pos
        query = location_query({'gte': start_pos}, {'lte': end_pos})

    docs = Search(query, idx=ElasticSettings.idx(name='GENE')).search().docs
    return {'es_genes': docs}
Exemplo n.º 9
0
    def query_match(cls, match_id, match_str, sources=None, highlight=None):
        ''' Factory method that wraps an elastic Match Query in a new instance
        of this class.

        @type  match_id: string
        @param match_id: The match id, passed on to L{Query.match}.
        @type  match_str: string
        @param match_str: The string value to match.
        @type  sources: array of result fields
        @keyword sources: The _source filtering to be used (default: None).
        @type  highlight: Highlight
        @keyword highlight: Define the highlighting of results (default: None).
        @return: L{ElasticQuery}
        '''
        return cls(Query.match(match_id, match_str), sources, highlight)
Exemplo n.º 10
0
    def _get_chr_band(self, seqid, position):
        ''' Get chr band for a given chr/position '''
        if seqid == 6 and position >= 24891793 and position <= 34924245:
            return 'MHC'

        query = ElasticQuery(
            BoolQuery(must_arr=[
                Query.match("seqid", seqid),
                RangeQuery("start", lte=position),
                RangeQuery("stop", gte=position)
            ]))
        result = Search(query, idx=ElasticSettings.idx('BAND', 'BAND'),
                        size=1).search()
        return (getattr(result.docs[0], "seqid") +
                getattr(result.docs[0], "name"))
Exemplo n.º 11
0
    def pad_region_doc(cls, region):
        ''' Adds details of disease_loci & hits for a given region doc.

        Study hits whose "disease_locus" is one of the region's disease loci
        are fetched from the REGION/STUDY_HITS index, together with a nested
        aggregation over their build 38 build_info entries (minimum start,
        maximum end). The following attributes are then set on region:

        - build_info: dict with 'build' (38), 'seqid', 'start', 'end';
        - hits: ids of the hit documents;
        - markers, genes, studies, pmids: de-duplicated values collected
          from the hits (order is not preserved).

        Returns the same region object.
        '''
        hits_idx = ElasticSettings.idx('REGION', 'STUDY_HITS')

        disease_loci = getattr(region, "disease_loci")

        # min(start) / max(end) over the build 38 entries of the nested
        # build_info objects
        locus_start = Agg('region_start', 'min', {'field': 'build_info.start'})
        locus_end = Agg('region_end', 'max', {'field': 'build_info.end'})
        match_agg = Agg('filtered_result', 'filter', Query.match("build_info.build", 38).query_wrap(),
                        sub_agg=[locus_start, locus_end])
        build_info_agg = Agg('build_info', 'nested', {"path": 'build_info'}, sub_agg=[match_agg])

        query = ElasticQuery(FilteredQuery(Query.terms("disease_locus", disease_loci),
                                           Filter(BoolQuery(should_arr=[Query.missing_terms("field", "group_name")]
                                                            ))))
        resultObj = Search(search_query=query, idx=hits_idx, aggs=Aggs(build_info_agg)).search()

        hit_ids = []
        markers = []
        genes = []
        studies = []
        pmids = []
        for doc in resultObj.docs:
            hit_ids.append(doc.doc_id())
            markers.append(getattr(doc, "marker"))
            # "genes" may be absent or None on a hit
            doc_genes = getattr(doc, "genes", None)
            if doc_genes is not None:
                genes.extend(doc_genes)
            studies.append(getattr(doc, "dil_study_id"))
            pmids.append(getattr(doc, "pmid"))

        build_info = getattr(resultObj.aggs['build_info'], 'filtered_result')
        region_start = int(build_info['region_start']['value'])
        region_end = int(build_info['region_end']['value'])

        build_info = {
            'build': 38,
            'seqid': getattr(region, "seqid"),
            'start': region_start,
            'end': region_end
        }
        setattr(region, "build_info", build_info)
        setattr(region, "hits", hit_ids)
        setattr(region, "markers", list(set(markers)))
        setattr(region, "genes", list(set(genes)))
        setattr(region, "studies", list(set(studies)))
        setattr(region, "pmids", list(set(pmids)))

        return region
Exemplo n.º 12
0
def chicpeaSearch(request, url):
    ''' Run a CHiCP search and return the result as a JsonResponse.

    GET parameters read: "targetIdx", "searchTerm", "snp_track" and the
    optional "region". The search term is classified as

    - a region ("chr:start-end", or an explicit "region" parameter),
    - a SNP (term starting with "rs" followed by digits), or
    - a gene name (everything else),

    and the matching bait/target interactions are fetched for the target
    dataset. Genes, SNPs and fragments for the resulting segment are added
    to the response. Any failure that is detected is reported as a JSON
    object with a single 'error' key (plus 'results'/'cols' when a gene name
    is ambiguous).

    The url argument is not used by this function.
    '''
    queryDict = request.GET
    user = request.user
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    hic = []
    addList = []
    searchType = 'gene'
    # Raw term as typed by the user (re-used in error messages); a missing
    # parameter is treated as an empty term instead of raising on None.
    rawSearchTerm = queryDict.get("searchTerm", "")
    searchTerm = rawSearchTerm.upper()
    searchTerm = searchTerm.replace(",", "")
    searchTerm = searchTerm.replace("..", "-")
    searchTerm = searchTerm.replace(" ", "")  # Chris suggestion to prevent issue with spaces in queries
    snpTrack = queryDict.get("snp_track")
    regionPattern = r"(.*):(\d+)-(\d+)"

    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(
                                            user=user, idx_keys=None, idx_type_keys=None)

    if snpTrack:
        mo = re.match(r"(.*)-(.*)", snpTrack)
        if mo is None:
            # no "group-track" separator: drop the track rather than fail
            snpTrack = None
        else:
            group = mo.group(1)
            # tracks outside the 'ud' group need an authorised index type
            if group != 'ud' and 'CP_STATS_'+group.upper()+'.'+snpTrack.upper() not in idx_type_keys_auth:
                snpTrack = None

    # NOTE(review): entries are stored under 'CP_TARGET_'+target while this
    # guard tests the bare targetIdx, so the loop presumably runs on every
    # request. It also performs the permission check for targetIdx, so the
    # guard must not be "fixed" without keeping that check per request.
    if targetIdx not in utils.tissues:
        for target in getattr(chicp_settings, 'CP_TARGET'):
            if 'CP_TARGET_'+target not in idx_keys_auth:
                if targetIdx == target:
                    retJSON = {'error': 'Sorry, you do not have permission to view this dataset.'}
                    return JsonResponse(retJSON)
                continue
            elasticJSON = Search(idx=ElasticSettings.idx('CP_TARGET_'+target)).get_mapping(mapping_type="gene_target")
            tissueList = list(elasticJSON[ElasticSettings.idx('CP_TARGET_'+target)]
                              ['mappings']['gene_target']['_meta']['tissue_type'].keys())
            utils.tissues['CP_TARGET_'+target] = tissueList

    if queryDict.get("region") or re.match(regionPattern, searchTerm):
        searchType = 'region'
        region = searchTerm
        if queryDict.get("region"):
            region = queryDict.get("region")
        else:
            searchTerm = ""
        mo = re.match(regionPattern, region)
        if mo is None:
            # an explicit "region" parameter that is not chr:start-end
            retJSON = {'error': 'Region is not in the expected format chr:start-end.'}
            return JsonResponse(retJSON)
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "")
        chrom = chrom.replace('CHR', "")
    if re.search("^rs[0-9]+", searchTerm.lower()):
        searchTerm = searchTerm.lower()
        addList.append(_find_snp_position(snpTrack, searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        if searchType != 'region':
            searchType = 'snp'

    # logger.warn is a deprecated alias of logger.warning
    logger.warning("### "+searchType+" - "+searchTerm+' ###')

    if searchType == 'region':
        query_bool = BoolQuery()
        filter_bool = BoolQuery()
        # a free-text term supplied alongside an explicit region narrows the
        # search to matching names / Ensembl ids
        if searchTerm and len(addList) == 0 and re.match(regionPattern,
                                                         rawSearchTerm.replace(",", "")) is None:
            query_bool.must([Query.query_string(searchTerm, fields=["name", "ensg"]),
                             Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        else:
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])

        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if len(addList) > 0:
            # SNP within the region: bait or other end must span its position
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])
        else:
            # bait or other end must lie inside the requested segment
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
        (hic, v1, v2) = _build_hic_query(query, targetIdx, segmin, segmax)  # @UnusedVariable

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': rawSearchTerm+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        if len(addList) > 0:
            chrom = addList[0]['chr']

            query_bool = BoolQuery()
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
            query_bool = _add_tissue_filter(query_bool, targetIdx)

            filter_bool = BoolQuery()
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])

            query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                               sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
            hic, segmin, segmax = _build_hic_query(query, targetIdx)

            if "error" in hic:
                return JsonResponse(hic)
            if len(hic) == 0:
                retJSON = {'error': 'Marker '+searchTerm+' does not overlap any bait/target regions in this dataset.'}
                return JsonResponse(retJSON)
    else:
        # geneQuery = ElasticQuery.query_string(searchTerm, fields=["gene_name"])
        geneQuery = ElasticQuery.filtered(Query.match_all(), Filter(Query.match("gene_name", searchTerm).query_wrap()))
        # first search only counts the hits (size=0)
        resultObj = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/',
                           search_query=geneQuery, size=0, qsort=Sort('seqid:asc,start')).search()
        if resultObj.hits_total > 1:
            # ambiguous gene name: keep only the genes present in the target
            # dataset and, if several remain, let the user choose
            geneResults = []
            resultObj2 = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/', search_query=geneQuery,
                                size=(resultObj.hits_total+1), qsort=Sort('seqid:asc,start')).search()

            docs = resultObj2.docs
            # [1:-1] drops the first and last character of the stored id
            # (presumably surrounding quotes, cf. the replace('\"', '') below)
            gene_ids = [getattr(doc, 'attr')['gene_id'][1:-1] for doc in docs]

            query = ElasticQuery.filtered(Query.match_all(), TermsFilter.get_terms_filter('ensg', gene_ids))
            agg = Agg('ensg_agg', "terms", {"field": "ensg", "size": 0})
            res = Search(idx=ElasticSettings.idx('CP_TARGET_'+targetIdx), search_query=query, aggs=Aggs(agg),
                         size=0).search()

            ensg_count = res.aggs['ensg_agg'].get_buckets()
            gene_ids = [g['key'] for g in ensg_count]

            for d in resultObj2.docs:
                if getattr(d, "attr")["gene_id"].replace('\"', '') in gene_ids:
                    geneResults.append({
                        'gene_name': getattr(d, "attr")["gene_name"].replace('\"', ''),
                        'gene_id': getattr(d, "attr")["gene_id"].replace('\"', ''),
                        'location': "chr" + getattr(d, "seqid") + ":" +
                        locale.format_string("%d", getattr(d, "start"), grouping=True) + ".." +
                        locale.format_string("%d", getattr(d, "end"), grouping=True),
                    })

            if len(geneResults) == 0:
                retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
                return JsonResponse(retJSON)
            elif len(geneResults) > 1:
                retJSON = {
                    'error': 'Gene name <strong>'+searchTerm+'</strong> returns too many hits, please select your prefered result from the list below.',
                    'results': geneResults,
                    'cols': ['HGNC Symbol', 'Ensembl Gene ID', 'Location']
                }
                return JsonResponse(retJSON)

        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool, sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])

        (hic, segmin, segmax) = _build_hic_query(query, targetIdx)

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
            return JsonResponse(retJSON)
        chrom = hic[0]['baitChr']

    # chrom is only assigned inside the branches above; referencing it here
    # raises NameError (UnboundLocalError) if none of them set it
    try:
        chrom
    except NameError:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(snpTrack, chrom, segmin, segmax)
    frags = _build_frags_query(getattr(chicp_settings, 'DEFAULT_FRAG'), chrom, segmin, segmax)

    addList = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], addList)

    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": int(segmin),
                        "oend": int(segmax),
                        "rstart": 1,
                        "rend": int(segmax) - int(segmin),
                        "rchr": str(chrom),
                        "tissues": utils.tissues['CP_TARGET_'+targetIdx]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }

    response = JsonResponse(retJSON)
    return response
Exemplo n.º 13
0
    def add_study_data(self, **options):
        ''' Add gwas stats from a study.

        Options read:
        - study_id: the study whose hits are replaced;
        - addStudyData: path of the CSV file (',' delimiter, '|' quote char)
          holding one hit per row, see the column list below.

        All existing REGION/STUDY_HITS documents of the study are deleted
        first. Every CSV row is then validated (marker, disease, alleles),
        completed with chromosome band, gene and build information and
        posted as a new document. Rows that fail validation are skipped;
        the collected error/warning messages are printed at the end.
        '''
        study = options['study_id']
        file = options['addStudyData']
        messages = []
        print("Deleting study hits for " + study)
        Delete.docs_by_query(ElasticSettings.idx('REGION', 'STUDY_HITS'),
                             query=Query.term("dil_study_id", study))

        with open(file, newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for row in reader:
                # skip blank lines and the header row
                if not row or row[0] == 'Marker':
                    continue
                # 0 - Marker
                # 1 - disease
                # 2 - Chromosome
                # 3 - Region Start
                # 4 - Region End
                # 5 - Position
                # 6 - Strand
                # 7 - Major Allele
                # 8 - Minor allele
                # 9 - Minor allele frequency
                # 10 - Discovery P value
                # 11 - Discovery Odds ratio
                # 12 - Discovery 95% confidence interval lower limit
                # 13 - Discovery 95% confidence interval upper limit
                # 14 - Replication P value
                # 15 - Replication Odds ratio
                # 16 - Replication 95% confidence interval lower limit
                # 17 - Replication 95% confidence interval upper limit
                # 18 - Combined P value
                # 19 - Combined Odds ratio
                # 20 - Combined 95% confidence interval lower limit
                # 21 - Combined 95% confidence interval upper limit
                # 22 - PP Colocalisation
                # 23 - Gene
                # 24 - PubMed ID
                # 25 - Other Signal
                # 26 - Notes
                # 27 - Curation status/ failed quality control
                row_label = "ERROR loading row of gwas data for " + row[0]

                query = ElasticQuery(Query.match("id", row[0]))
                result = Search(search_query=query,
                                idx=ElasticSettings.idx('MARKER',
                                                        'MARKER')).search()
                if result.hits_total == 0:
                    # the id may have been retired: look up its current id
                    # in the marker history and retry with that
                    result2 = Search(search_query=ElasticQuery(
                        Query.match("rshigh", row[0])),
                                     idx=ElasticSettings.idx(
                                         'MARKER', 'HISTORY')).search()
                    if result2.hits_total > 0:
                        history_doc = result2.docs[0]
                        new_id = getattr(history_doc, "rscurrent")
                        query = ElasticQuery(Query.match("id", new_id))
                        result = Search(search_query=query,
                                        idx=ElasticSettings.idx(
                                            'MARKER', 'MARKER')).search()

                if result.hits_total != 1:
                    messages.append(row_label + " - Marker cannot be found; <br />\n")
                    # without a unique marker the row cannot be loaded
                    # (result.docs[0] would fail or be ambiguous)
                    continue

                marker = result.docs[0]

                query = ElasticQuery(Query.match("code", row[1]))
                result = Search(query,
                                idx=ElasticSettings.idx('DISEASE',
                                                        'DISEASE')).search()
                if result.hits_total != 1:
                    messages.append(row_label + " - Disease cannot be found; <br />\n")
                    continue
                disease = result.docs[0]

                if not re.match(r"^\w$", row[7]):
                    messages.append(row_label + " - Major allele is not set; <br />\n")
                    continue
                if not re.match(r"^\w$", row[8]):
                    messages.append(row_label + " - Minor allele is not set; <br />\n")
                    continue
                if float(row[9]) > 0.5:
                    messages.append("WARNING - MAF for " + row[0] + " is >0.5; <br />\n")

                # numeric strand -> '+' / '-'; the CSV value is a string, so
                # it has to be converted before comparing with 0
                strand = row[6]
                if re.match(r"^[+-]?\d+$", strand):
                    strand = '+' if int(strand) > 0 else '-'
                row[6] = strand

                if not re.match(r"\d+", row[2]):
                    row[2] = getattr(marker, "seqid")
                # the marker's own start always wins over the CSV position
                row[5] = getattr(marker, "start")

                data = {
                    "chr_band": self._get_chr_band(row[2], row[5]),
                    "other_signal": row[25],
                    "species": "Human",
                    "disease": getattr(disease, "code"),
                    "notes": row[26],
                    "disease_locus": "TBC",
                    "dil_study_id": study,
                    "marker": getattr(marker, "id"),
                    "status": "N",
                    "pp_probability": row[22],
                    "tier": 100,
                    "pmid": row[24],
                    "genes": self._get_ens_gene(row[23])
                }

                build_info = self._get_current_build_info(row[2], row[5])
                data['build_info'] = [build_info]

                data['p_values'] = {
                    'discovery': row[10],
                    'replication': row[14],
                    'combined': row[18]
                }

                data['odds_ratios'] = {
                    'discovery': {
                        "or": row[11],
                        "lower": row[12],
                        "upper": row[13]
                    },
                    'replication': {
                        "or": row[15],
                        "lower": row[16],
                        "upper": row[17]
                    },
                    'combined': {
                        "or": row[19],
                        "lower": row[20],
                        "upper": row[21]
                    }
                }

                data['alleles'] = {
                    'major': row[7],
                    'minor': row[8],
                    'maf': row[9]
                }

                data['suggest'] = {'input': [], 'weight': 1}

                r = Search.elastic_request(
                    ElasticSettings.url(),
                    ElasticSettings.idx('REGION', 'STUDY_HITS'),
                    json.dumps(data))
                # 201 (Created) is the only status treated as success
                if r.status_code != 201:
                    messages.append(row_label + " - Failed to create document; <br />\n")

        print("\n\n" + "".join(messages))
Exemplo n.º 14
0
def _chicpea_position_filter(position):
    ''' Return a BoolQuery filter that matches when either the bait interval
    (baitStart..baitEnd) or the oe interval (oeStart..oeEnd) contains the
    given position. '''
    filter_bool = BoolQuery()
    filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                            RangeQuery("baitEnd", gte=position)]),
                        BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                            RangeQuery("oeEnd", gte=position)])])
    return filter_bool


def chicpeaSearch(request, url):
    ''' Run a search against a CP_TARGET_<targetIdx> index and return the
    result as a JsonResponse.

    GET parameters read from the request:
      - searchTerm: gene name, rs marker id or "chrom:start-end" region
        (commas are stripped and ".." is accepted in place of "-").
      - region: optional explicit "chrom:start-end" region; when given it
        takes precedence over a region embedded in searchTerm.
      - targetIdx: name of the target dataset (without the CP_TARGET_ prefix).
      - snp_track: optional "<group>-<track>" identifier of a SNP track.

    The url argument is not used by this function.

    On success the response holds the keys hic, frags, meta, snps, snp_meta,
    genes, region, blueprint and extra. On any failure the response is a
    JSON object with an 'error' key (plus 'results' and 'cols' when a gene
    name matches several genes). '''
    # Local import so this block does not depend on a file-level import.
    from html import escape as html_escape

    region_pattern = r"(.*):(\d+)-(\d+)"

    queryDict = request.GET
    user = request.user
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    hic = []
    addList = []
    searchType = 'gene'
    chrom = None
    # A missing searchTerm used to raise AttributeError on None.upper().
    rawTerm = queryDict.get("searchTerm") or ""
    searchTerm = rawTerm.upper().replace(",", "").replace("..", "-")
    regionParam = queryDict.get("region")
    snpTrack = queryDict.get("snp_track")

    if not targetIdx:
        return JsonResponse({'error': 'No target dataset specified.'})
    if not searchTerm and not regionParam:
        return JsonResponse({'error': 'No search term specified.'})
    targetKey = 'CP_TARGET_' + targetIdx

    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(
                                            user=user, idx_keys=None, idx_type_keys=None)

    if snpTrack:
        mo = re.match(r"(.*)-(.*)", snpTrack)
        # A track id without a '-' cannot be mapped to a CP_STATS_<group> key,
        # so it is dropped just like a track the user may not access.
        group = mo.group(1) if mo else None
        if group is None or (group != 'ud' and
                             'CP_STATS_'+group.upper()+'.'+snpTrack.upper() not in idx_type_keys_auth):
            snpTrack = None

    # NOTE(review): entries are stored under 'CP_TARGET_'+target while the test
    # below uses the bare targetIdx, so this block appears to run on every
    # request. It is left unchanged because it also performs the per-request
    # permission check for the requested dataset.
    if targetIdx not in utils.tissues:
        for target in chicp_settings.CP_TARGET:
            if 'CP_TARGET_'+target not in idx_keys_auth:
                if targetIdx == target:
                    retJSON = {'error': 'Sorry, you do not have permission to view this dataset.'}
                    return JsonResponse(retJSON)
                continue
            elasticJSON = Search(idx=ElasticSettings.idx('CP_TARGET_'+target)).get_mapping(mapping_type="gene_target")
            tissueList = list(elasticJSON[ElasticSettings.idx('CP_TARGET_'+target)]
                              ['mappings']['gene_target']['_meta']['tissue_type'].keys())
            utils.tissues['CP_TARGET_'+target] = tissueList

    # Every branch below indexes utils.tissues[targetKey]; report an unknown
    # dataset as a JSON error instead of failing with a KeyError.
    if targetKey not in utils.tissues:
        return JsonResponse({'error': 'Dataset '+html_escape(targetIdx)+' is not available.'})
    sources = utils.hicFields + utils.tissues[targetKey]

    if regionParam or re.match(region_pattern, searchTerm):
        searchType = 'region'
        if regionParam:
            region = regionParam
        else:
            region = searchTerm
            searchTerm = ""
        mo = re.match(region_pattern, region)
        if mo is None:
            # Only reachable for a malformed 'region' parameter.
            return JsonResponse({'error': 'Region '+html_escape(region) +
                                 ' is not of the form chromosome:start-end.'})
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "").replace('CHR', "")

    if re.search("^rs[0-9]+", searchTerm.lower()):
        searchTerm = searchTerm.lower()
        addList.append(_find_snp_position(snpTrack, searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        if searchType != 'region':
            searchType = 'snp'

    logger.warning("### %s - %s ###", searchType, searchTerm)

    if searchType == 'region':
        must = [Query.term("baitChr", chrom),
                Query.term("oeChr", chrom),
                RangeQuery("dist", gte=-2e6, lte=2e6)]
        # Restrict by name only when a non-marker, non-region term came with
        # an explicit region parameter.
        if searchTerm and not addList and re.match(region_pattern, rawTerm.replace(",", "")) is None:
            must.insert(0, Query.query_string(searchTerm, fields=["name", "ensg"]))
        query_bool = BoolQuery()
        query_bool.must(must)
        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if addList:
            filter_bool = _chicpea_position_filter(position)
        else:
            filter_bool = BoolQuery()
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool, sources=sources)
        (hic, v1, v2) = _build_hic_query(query, targetIdx, segmin, segmax)  # @UnusedVariable

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': html_escape(rawTerm or regionParam) +
                       ' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        # searchType is only set to 'snp' after a marker was appended to
        # addList, so addList[0] and position are always available here.
        chrom = addList[0]['chr']

        query_bool = BoolQuery()
        query_bool.must([Query.term("baitChr", chrom),
                         Query.term("oeChr", chrom),
                         RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)

        query = ElasticQuery.filtered_bool(query_bool, _chicpea_position_filter(position), sources=sources)
        hic, segmin, segmax = _build_hic_query(query, targetIdx)

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': 'Marker '+html_escape(searchTerm) +
                       ' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)
    else:
        geneIdx = chicp_settings.CP_GENE_IDX + '/genes/'
        geneQuery = ElasticQuery.filtered(Query.match_all(), Filter(Query.match("gene_name", searchTerm).query_wrap()))
        # First request (size=0) only fetches the hit count.
        resultObj = Search(idx=geneIdx, search_query=geneQuery, size=0, qsort=Sort('seqid:asc,start')).search()
        if resultObj.hits_total > 1:
            resultObj2 = Search(idx=geneIdx, search_query=geneQuery,
                                size=(resultObj.hits_total+1), qsort=Sort('seqid:asc,start')).search()
            docs = resultObj2.docs
            # gene_id values are stored wrapped in double quotes.
            gene_ids = [doc.attr['gene_id'][1:-1] for doc in docs]

            # Keep only the genes that actually occur in the target dataset.
            query = ElasticQuery.filtered(Query.match_all(), TermsFilter.get_terms_filter('ensg', gene_ids))
            agg = Agg('ensg_agg', "terms", {"field": "ensg", "size": 0})
            res = Search(idx=ElasticSettings.idx(targetKey), search_query=query, aggs=Aggs(agg),
                         size=0).search()
            found_ids = {bucket['key'] for bucket in res.aggs['ensg_agg'].get_buckets()}

            geneResults = []
            for d in docs:
                gene_id = d.attr["gene_id"].replace('"', '')
                if gene_id in found_ids:
                    geneResults.append({
                        'gene_name': d.attr["gene_name"].replace('"', ''),
                        'gene_id': gene_id,
                        'location': "chr" + d.seqid + ":" +
                        locale.format_string("%d", d.start, grouping=True) + ".." +
                        locale.format_string("%d", d.end, grouping=True),
                    })

            if not geneResults:
                retJSON = {'error': 'Gene name '+html_escape(searchTerm)+' not found in this dataset.'}
                return JsonResponse(retJSON)
            elif len(geneResults) > 1:
                # This message carries HTML markup, so the user-supplied term
                # must be escaped before it is embedded.
                retJSON = {
                    'error': 'Gene name <strong>'+html_escape(searchTerm)+'</strong> returns too many hits, please select your prefered result from the list below.',
                    'results': geneResults,
                    'cols': ['HGNC Symbol', 'Ensembl Gene ID', 'Location']
                }
                return JsonResponse(retJSON)

        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool, sources=sources)

        (hic, segmin, segmax) = _build_hic_query(query, targetIdx)

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': 'Gene name '+html_escape(searchTerm)+' not found in this dataset.'}
            return JsonResponse(retJSON)
        chrom = hic[0]['baitChr']

    if chrom is None:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes, snps and fragments based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(snpTrack, chrom, segmin, segmax)
    frags = _build_frags_query(chicp_settings.DEFAULT_FRAG, chrom, segmin, segmax)

    ostart = int(segmin)
    oend = int(segmax)
    addList = utils.makeRelative(ostart, oend, ['start', 'end'], addList)

    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": ostart,
                        "oend": oend,
                        "rstart": 1,
                        "rend": oend - ostart,
                        "rchr": str(chrom),
                        "tissues": utils.tissues[targetKey]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }

    return JsonResponse(retJSON)