def get_elastic_settings_with_user_uploads(cls, elastic_dict=None, new_upload_file=None):
        '''Get the elastic settings updated with the user uploaded idx_types.

        The index types are read from the mapping of the CP_STATS_UD index. A
        type is kept only when a ContentType of cls.PERMISSION_MODEL_APP_NAME
        has a name ending in <idx_type> + cls.PERMISSION_MODEL_TYPE_SUFFIX.

        @type  elastic_dict: dict
        @param elastic_dict: index settings, updated in place; defaults to
                             ElasticSettings.attrs().get('IDX').
        @type  new_upload_file: string
        @param new_upload_file: idx_type of a new upload that is added without
                                the mapping and content type checks.
        @return: elastic_dict with its ['CP_STATS_UD']['idx_type'] entry replaced,
                 or None if the mapping request returned an error.
        '''
        idx_key = 'CP_STATS_UD'
        idx = ElasticSettings.idx(idx_key)
        elastic_url = ElasticSettings.url()

        response = Search.elastic_request(elastic_url, idx + '/_mapping', is_post=False)
        # Parse the body once and reuse it for the error check and the lookup.
        elastic_mapping = json.loads(response.content.decode("utf-8"))
        if "error" in elastic_mapping:
            logger.warning(elastic_mapping)
            return None

        # The mapping is keyed by the index name the server reports, which can
        # differ from idx when idx is an alias, so use the returned key.
        idx_types = []
        if elastic_mapping:
            idx = next(iter(elastic_mapping))
            idx_types = list(elastic_mapping[idx]['mappings'])

        if elastic_dict is None:
            elastic_dict = ElasticSettings.attrs().get('IDX')

        existing_ct = [ct.name for ct in ContentType.objects.filter(app_label=cls.PERMISSION_MODEL_APP_NAME)]

        idx_type_dict = {}
        for idx_type in idx_types:
            idx_type_with_suffix = idx_type + cls.PERMISSION_MODEL_TYPE_SUFFIX
            # One lookup per idx_type is enough, however many content types match.
            if not any(ct.endswith(idx_type_with_suffix) for ct in existing_ct):
                continue

            meta_url = idx + '/' + idx_type + '/_meta/_source'
            meta_response = Search.elastic_request(elastic_url, meta_url, is_post=False)
            try:
                label = json.loads(meta_response.content.decode("utf-8"))['label']
            except (AttributeError, KeyError, TypeError, ValueError):
                # Best effort: missing or malformed meta falls back to a default label.
                label = "UD-" + idx_type

            idx_type_dict['UD-' + idx_type.upper()] = {'label': label, 'type': idx_type}

        if new_upload_file is not None:
            idx_type = new_upload_file
            idx_type_dict['UD-' + idx_type.upper()] = {'label': "UD-" + idx_type, 'type': idx_type}

        elastic_dict[idx_key]['idx_type'] = idx_type_dict
        return elastic_dict
    def get_models_to_delete(self):
        '''Get the idx_types of the CP_STATS_UD index whose models should be deleted.

        An idx_type is selected when its document count is greater than zero, or
        when the 'uploaded' date stored in its _meta document is more than
        expire_days (7) days before today. Each idx_type is listed once.

        @return: list of idx_type names, or None if the mapping request
                 returned an error.
        '''
        idx_key = 'CP_STATS_UD'
        idx = ElasticSettings.idx(idx_key)
        elastic_url = ElasticSettings.url()
        response = Search.elastic_request(elastic_url, idx + '/_mapping', is_post=False)

        # Parse the body once and reuse it for the error check and the lookup.
        elastic_mapping = json.loads(response.content.decode("utf-8"))
        if "error" in elastic_mapping:
            logger.warning(elastic_mapping)
            return None
        if not elastic_mapping:
            # No index in the response, hence no idx_types to consider.
            return []

        # The mapping is keyed by the index name the server reports, which can
        # differ from idx when idx is an alias, so use the returned key.
        idx = next(iter(elastic_mapping))
        idx_types = list(elastic_mapping[idx]['mappings'])

        models2go = []
        expire_days = 7  # 1 week
        oldest_allowed = datetime.date.today() - datetime.timedelta(days=expire_days)

        for idx_type in idx_types:
            ndocs = Search(idx=idx, idx_type=idx_type).get_count()['count']

            # NOTE(review): this selects idx_types that HAVE documents, while the
            # comment that used to sit here said "no docs" - confirm the intent.
            if ndocs > 0:
                models2go.append(idx_type)
                continue  # already selected; avoids listing it a second time

            # select idx_types whose upload date is older than the expiry period
            url = idx + '/' + idx_type + '/_meta'
            response = Search.elastic_request(elastic_url, url, is_post=False)
            elastic_meta = json.loads(response.content.decode("utf-8"))
            if '_source' not in elastic_meta:
                continue

            # Format: 2015-11-03 14:43:54.099645+00:00 - only the date part is used
            yymmdd_str = elastic_meta['_source']['uploaded'].split()[0]
            uploaded_date = datetime.datetime.strptime(yymmdd_str, '%Y-%m-%d').date()
            if uploaded_date < oldest_allowed:
                models2go.append(idx_type)

        return models2go
    def get_models_to_delete(self):
        """Get the idx_types of the CP_STATS_UD index whose models should be deleted.

        An idx_type is selected when it holds at most one document, or when the
        "uploaded" date stored in its _meta document is more than expire_days (7)
        days before today. Each idx_type is listed once.

        @return: list of idx_type names, or None if the mapping request
                 returned an error.
        """
        idx_key = "CP_STATS_UD"
        idx = ElasticSettings.idx(idx_key)
        elastic_url = ElasticSettings.url()
        response = Search.elastic_request(elastic_url, idx + "/_mapping", is_post=False)

        # Parse the body once and reuse it for the error check and the lookup.
        elastic_mapping = json.loads(response.content.decode("utf-8"))
        if "error" in elastic_mapping:
            logger.warning(elastic_mapping)
            return None
        if not elastic_mapping:
            # No index in the response, hence no idx_types to consider.
            return []

        # The mapping is keyed by the index name the server reports; presumably
        # that differs from idx when idx is an alias, so fall back to the
        # returned key instead of raising KeyError.
        if idx not in elastic_mapping:
            idx = next(iter(elastic_mapping))
        idx_types = list(elastic_mapping[idx]["mappings"])

        models2go = []
        expire_days = 7  # 1 week
        oldest_allowed = datetime.date.today() - datetime.timedelta(days=expire_days)

        for idx_type in idx_types:
            ndocs = Search(idx=idx, idx_type=idx_type).get_count()["count"]

            # select idx_types that hold at most one document
            if ndocs <= 1:
                models2go.append(idx_type)
                continue  # already selected; avoids listing it a second time

            # select idx_types whose upload date is older than the expiry period
            url = idx + "/" + idx_type + "/_meta"
            response = Search.elastic_request(elastic_url, url, is_post=False)
            elastic_meta = json.loads(response.content.decode("utf-8"))
            if "_source" not in elastic_meta:
                continue

            # Format: 2015-11-03 14:43:54.099645+00:00 - only the date part is used
            yymmdd_str = elastic_meta["_source"]["uploaded"].split()[0]
            uploaded_date = datetime.datetime.strptime(yymmdd_str, "%Y-%m-%d").date()
            if uploaded_date < oldest_allowed:
                models2go.append(idx_type)

        return models2go
Пример #4
0
    def get_meta_info(cls, idx, idx_type):
        '''Get the _meta section of the mapping of an index type.

        @type  idx: string
        @param idx: index name, also used as the key into the mapping response.
        @type  idx_type: string
        @param idx_type: index type whose mapping is requested.
        @return: the value stored under [idx]['mappings'][idx_type]['_meta'], or
                 None if the response cannot be parsed or lacks that entry.
        '''
        elastic_url = ElasticSettings.url()
        meta_url = idx + '/' + idx_type + '/_mapping'
        meta_response = Search.elastic_request(elastic_url, meta_url, is_post=False)

        try:
            elastic_meta = json.loads(meta_response.content.decode("utf-8"))
            return elastic_meta[idx]['mappings'][idx_type]['_meta']
        except (AttributeError, KeyError, TypeError, ValueError):
            # Best effort: an unusable response means there is no meta info,
            # without also swallowing KeyboardInterrupt/SystemExit.
            return None
    def get_elastic_settings_with_user_uploads(cls, elastic_dict=None):
        '''Get the elastic settings updated with the user uploaded idx_types.

        Every index type found in the mapping of the CP_STATS_UD index is added,
        labelled with the 'label' of its _meta document when one is available.

        @type  elastic_dict: dict
        @param elastic_dict: index settings, updated in place; defaults to
                             ElasticSettings.attrs().get('IDX').
        @return: elastic_dict with its ['CP_STATS_UD']['idx_type'] entry replaced,
                 or None if the mapping request returned an error.
        '''
        idx_key = 'CP_STATS_UD'
        idx = ElasticSettings.idx(idx_key)
        elastic_url = ElasticSettings.url()

        response = Search.elastic_request(elastic_url, idx + '/_mapping', is_post=False)
        # Parse the body once and reuse it for the error check and the lookup.
        elastic_mapping = json.loads(response.content.decode("utf-8"))
        if "error" in elastic_mapping:
            logger.warning(elastic_mapping)
            return None

        # The mapping is keyed by the index name the server reports; presumably
        # that differs from idx when idx is an alias, so fall back to the
        # returned key instead of raising KeyError.
        idx_types = []
        if elastic_mapping:
            if idx not in elastic_mapping:
                idx = next(iter(elastic_mapping))
            idx_types = list(elastic_mapping[idx]['mappings'])

        if elastic_dict is None:
            elastic_dict = ElasticSettings.attrs().get('IDX')

        idx_type_dict = {}
        for idx_type in idx_types:
            meta_url = idx + '/' + idx_type + '/_meta/_source'
            meta_response = Search.elastic_request(elastic_url, meta_url, is_post=False)
            try:
                label = json.loads(meta_response.content.decode("utf-8"))['label']
            except (AttributeError, KeyError, TypeError, ValueError):
                # Best effort: missing or malformed meta falls back to a default label.
                label = "UD-" + idx_type

            idx_type_dict['UD-' + idx_type.upper()] = {'label': label, 'type': idx_type}

        elastic_dict[idx_key]['idx_type'] = idx_type_dict
        return elastic_dict
Пример #6
0
    def get_criteria_index_types(cls, idx_key):
        '''Get the index types defined in the mapping of an index.

        @type  idx_key: string
        @param idx_key: settings key that ElasticSettings.idx resolves to the index name.
        @return: list of idx_type names, or None if the mapping request
                 returned an error.
        '''
        idx = ElasticSettings.idx(idx_key)
        elastic_url = ElasticSettings.url()
        response = Search.elastic_request(elastic_url, idx + '/_mappings', is_post=False)

        # Parse the body once and reuse it for the error check and the lookup.
        elastic_mapping = json.loads(response.content.decode("utf-8"))
        if "error" in elastic_mapping:
            logger.warning(elastic_mapping)
            return None

        # the keys under 'mappings' are the idx_types
        return list(elastic_mapping[idx]['mappings'])
Пример #7
0
 def test_criteria_mappings(self, idx, idx_types):
     '''Assert that each index type maps the core criteria fields and every site enabled disease.

     For each idx_type the mapping is requested from elastic and its property
     names are checked for 'score', 'disease_tags', 'qid' and for all the codes
     returned by CriteriaManager.get_available_diseases().
     '''
     main_codes, other_codes = CriteriaManager.get_available_diseases()
     expected_diseases = main_codes + other_codes
     base_url = ElasticSettings.url()
     core_fields = ('score', 'disease_tags', 'qid')

     for type_name in idx_types:
         mapping_response = Search.elastic_request(base_url, idx + '/' + type_name + '/_mapping', is_post=False)
         type_mapping = json.loads(mapping_response.content.decode("utf-8"))
         mapped_fields = list(type_mapping[idx]['mappings'][type_name]['properties'].keys())

         # the core fields must be part of the mapping
         for field_name in core_fields:
             self.assertIn(field_name, mapped_fields)

         # and so must every enabled disease
         for disease_code in expected_diseases:
             self.assertIn(disease_code, mapped_fields)
    def get_context_models_to_delete(self, *args, **options):
        '''Delete the ContentType rows that belong to one user uploaded idx_type.

        The idx_type is taken from options['content_type']. When it is present in
        the mapping of the CP_STATS_UD index and holds at least one document,
        every ContentType whose name ends in <content_type> + '_idx_type' is
        deleted.

        A JSON document is written to self.stdout: 'acknowledged' is 1 when the
        mapping could be read and 0 otherwise, in which case 'errorMsg' holds
        the error response. Nothing is returned.
        '''
        ct = options['content_type']
        retDict = dict()
        retDict['acknowledged'] = 0
        logger.debug(ct)
        idx_key = 'CP_STATS_UD'
        idx = ElasticSettings.idx(idx_key)
        elastic_url = ElasticSettings.url()
        response = Search.elastic_request(elastic_url, idx + '/_mapping', is_post=False)

        # Parse the body once and reuse it for the error check and the lookup.
        elastic_mapping = json.loads(response.content.decode("utf-8"))
        if "error" in elastic_mapping:
            logger.warning(elastic_mapping)
            retDict['errorMsg'] = elastic_mapping
            self.stdout.write(json.dumps(retDict))
            # Stop here: the error payload has no mapping to inspect.
            return

        # The mapping is keyed by the index name the server reports, which can
        # differ from idx when idx is an alias, so use the returned key.
        idx_types = []
        if elastic_mapping:
            idx = next(iter(elastic_mapping))
            idx_types = list(elastic_mapping[idx]['mappings'])
        logger.debug(idx_types)

        if ct in idx_types:
            logger.debug("Found %s  equal to %s", ct, ct)
            ndocs = Search(idx=idx, idx_type=ct).get_count()['count']
            logger.debug("WE have %s", ndocs)
            if ndocs > 0:
                ct_suffix = ct + '_idx_type'
                for cnt in ContentType.objects.all():
                    if str(cnt.name).endswith(ct_suffix):
                        logger.debug(
                            'Matched, finding permissions for %s  %s' %
                            (str(cnt.name), str(cnt.id)))
                        logger.debug("deleting %s" % ct)
                        cnt.delete()

        retDict['acknowledged'] = 1
        self.stdout.write(json.dumps(retDict))
    def get_elastic_settings_with_user_uploads(cls,
                                               elastic_dict=None,
                                               new_upload_file=None):
        '''Get the elastic settings updated with the user uploaded idx_types.

        The index types are read from the mapping of the CP_STATS_UD index. A
        type is kept only when a ContentType of cls.PERMISSION_MODEL_APP_NAME
        has a name ending in <idx_type> + cls.PERMISSION_MODEL_TYPE_SUFFIX.

        @type  elastic_dict: dict
        @param elastic_dict: index settings, updated in place; defaults to
                             ElasticSettings.attrs().get('IDX').
        @type  new_upload_file: string
        @param new_upload_file: idx_type of a new upload that is added without
                                the mapping and content type checks.
        @return: elastic_dict with its ['CP_STATS_UD']['idx_type'] entry replaced,
                 or None if the mapping request returned an error.
        '''
        idx_key = 'CP_STATS_UD'
        idx = ElasticSettings.idx(idx_key)
        elastic_url = ElasticSettings.url()

        response = Search.elastic_request(elastic_url,
                                          idx + '/_mapping',
                                          is_post=False)
        # Parse the body once and reuse it for the error check and the lookup.
        elastic_mapping = json.loads(response.content.decode("utf-8"))
        if "error" in elastic_mapping:
            logger.warning(elastic_mapping)
            return None

        # The mapping is keyed by the index name the server reports, which can
        # differ from idx when idx is an alias, so use the returned key.
        idx_types = []
        if elastic_mapping:
            idx = next(iter(elastic_mapping))
            idx_types = list(elastic_mapping[idx]['mappings'])

        if elastic_dict is None:
            elastic_dict = ElasticSettings.attrs().get('IDX')

        existing_ct = [
            ct.name for ct in ContentType.objects.filter(
                app_label=cls.PERMISSION_MODEL_APP_NAME)
        ]

        idx_type_dict = {}
        for idx_type in idx_types:
            idx_type_with_suffix = idx_type + cls.PERMISSION_MODEL_TYPE_SUFFIX
            # One lookup per idx_type is enough, however many content types match.
            if not any(ct.endswith(idx_type_with_suffix) for ct in existing_ct):
                continue

            meta_url = idx + '/' + idx_type + '/_meta/_source'
            meta_response = Search.elastic_request(elastic_url,
                                                   meta_url,
                                                   is_post=False)
            try:
                label = json.loads(
                    meta_response.content.decode("utf-8"))['label']
            except (AttributeError, KeyError, TypeError, ValueError):
                # Best effort: missing or malformed meta falls back to a default label.
                label = "UD-" + idx_type

            idx_type_dict['UD-' + idx_type.upper()] = {
                'label': label,
                'type': idx_type
            }

        if new_upload_file is not None:
            idx_type = new_upload_file
            idx_type_dict['UD-' + idx_type.upper()] = {
                'label': "UD-" + idx_type,
                'type': idx_type
            }

        elastic_dict[idx_key]['idx_type'] = idx_type_dict
        return elastic_dict
Пример #10
0
    def filter_queryset(self, request, queryset, view):
        ''' Get disease regions, optionally with their genes and markers.

        Only the GET parameters listed in view.filter_fields are honoured:
        'disease' (default 'T1D'), 'build' (default settings.DEFAULT_BUILD),
        'genes' and 'markers' (off by default) and 'regions' (on unless it is
        the string 'false').

        @return: list of region dicts (empty when regions are switched off)
                 followed by the gene dicts and then the marker hit dicts.
        @raise Http404: on TypeError, ValueError, IndexError or ConnectionError.
        '''
        def _flag_enabled(value):
            ''' Query string flags arrive as strings, so 'false' must not count as on. '''
            return bool(value) and str(value).lower() != 'false'

        try:
            filterable = getattr(view, 'filter_fields', [])
            filters = {k: v for k, v in request.GET.items() if k in filterable}
            dis = filters.get('disease', 'T1D')
            show_genes = _flag_enabled(filters.get('genes', False))
            show_markers = _flag_enabled(filters.get('markers', False))
            show_regions = filters.get('regions', True)

            build = self._get_build(filters.get('build', settings.DEFAULT_BUILD))
            docs = DiseaseLocusDocument.get_disease_loci_docs(dis)
            if len(docs) == 0:
                messages.error(request, 'No regions found for '+dis+'.')

            visible_hits = DiseaseLocusDocument.get_hits([h for r in docs for h in getattr(r, 'hits')])
            regions = []
            all_markers = []
            all_genes = []
            ens_all_cand_genes = []
            for r in docs:
                region = r.get_disease_region(visible_hits, build=build)
                if region is not None:
                    ens_all_cand_genes.extend(region['ens_cand_genes'])
                    all_markers.extend(region['markers'])
                    region['hits'] = [self._study_hit_obj(s, region) for s in
                                      StudyHitDocument.process_hits(r.hit_docs, region['all_diseases'])]

                    # genes are collected 500kb either side of the region
                    (all_coding, all_non_coding) = views.get_genes_for_region(getattr(r, "seqid"),
                                                                              region['rstart']-500000,
                                                                              region['rstop']+500000)
                    (region_coding, coding_up, coding_down) = views._region_up_down(all_coding, region['rstart'],
                                                                                    region['rstop'])
                    (region_non_coding, non_coding_up, non_coding_down) = \
                        views._region_up_down(all_non_coding, region['rstart'], region['rstop'])
                    region['genes'] = {
                        'upstream': {'coding': [g.doc_id() for g in coding_up],
                                     'non_coding': [g.doc_id() for g in non_coding_up]},
                        'region': {'coding': [g.doc_id() for g in region_coding],
                                   'non_coding': [g.doc_id() for g in region_non_coding]},
                        'downstream': {'coding': [g.doc_id() for g in coding_down],
                                       'non_coding': [g.doc_id() for g in non_coding_down]},
                    }
                    all_genes.extend(region['genes']['region']['coding'])
                    all_genes.extend(region['genes']['region']['non_coding'])
                    regions.append(region)

            # look for pleiotropy by looking for diseases for the markers in IC_STATS and other study hits
            stats_query = ElasticQuery.filtered(Query.terms("marker", all_markers),
                                                Filter(RangeQuery("p_value", lte=5E-08)))
            stats_docs = Search(stats_query, idx=ElasticSettings.idx("IC_STATS"), size=len(all_markers)).search().docs
            meta_response = Search.elastic_request(ElasticSettings.url(), ElasticSettings.idx("IC_STATS") + '/_mapping',
                                                   is_post=False)
            extra_markers = []
            for region in regions:
                # add diseases from IC/GWAS stats
                (study_ids, region['marker_stats']) = views._process_stats(stats_docs, region['markers'], meta_response)
                region['all_diseases'].extend([getattr(mstat, 'disease') for mstat in region['marker_stats']])

                other_hits_query = ElasticQuery(
                        BoolQuery(must_arr=[RangeQuery("tier", lte=2), Query.terms("marker", region['markers'])],
                                  must_not_arr=[Query.terms("dil_study_id", study_ids)]))
                other_hits = Search(other_hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'),
                                    size=100).search()
                region['extra_markers'] = [self._study_hit_obj(s, region) for s in
                                           StudyHitDocument.process_hits(other_hits.docs, region['all_diseases'])]
                region['all_diseases'] = list(set(region['all_diseases']))
                extra_markers.extend([m['marker_id'] for m in region['extra_markers']])

            # get markers
            marker_objs = []
            if show_markers:
                query = ElasticQuery(Query.terms("id", all_markers), sources=['id', 'start'])
                marker_docs = Search(search_query=query, idx=ElasticSettings.idx('MARKER', 'MARKER'),
                                     size=len(all_markers)).search().docs
                mids = {getattr(m, 'id'): getattr(m, 'start') for m in marker_docs}
                marker_objs = [h for r in regions for h in r['hits']]
                marker_objs.extend([h for r in regions for h in r['extra_markers']])
                for m in marker_objs:
                    m['start'] = mids[m['marker_id']]

            # get genes
            gene_objs = []
            if show_genes:
                all_genes.extend(ens_all_cand_genes)
                gene_docs = GeneDocument.get_genes(all_genes, sources=['start', 'stop', 'chromosome',
                                                                       'symbol', 'biotype'])
                for doc in Document.sorted_alphanum(gene_docs, 'chromosome'):
                    ensembl_id = doc.doc_id()
                    region_name = ''
                    candidate_gene = 0
                    # attribute the gene to the first region that contains it
                    for region in regions:
                        if ('genes' in region and
                            (ensembl_id in region['genes']['region']['coding'] or
                             ensembl_id in region['genes']['region']['non_coding'] or
                             ensembl_id in region['ens_cand_genes'])):
                            region_name = region['region_name']
                            candidate_gene = 1 if ensembl_id in region['ens_cand_genes'] else 0
                            break
                    gene_objs.append({
                        'ensembl_id': ensembl_id,
                        'seqid': 'chr'+getattr(doc, 'chromosome'),
                        'start': getattr(doc, 'start'),
                        'end': getattr(doc, 'stop'),
                        'symbol': getattr(doc, 'symbol'),
                        'biotype': getattr(doc, 'biotype'),
                        'region_name': region_name,
                        'candidate_gene': candidate_gene
                    })
            if show_regions == 'false':
                regions = []
            regions.extend(gene_objs)
            regions.extend(marker_objs)
            return regions
        except (TypeError, ValueError, IndexError, ConnectionError) as e:
            # Record the cause in the log instead of printing it to stdout.
            import logging
            logging.getLogger(__name__).exception('Failed to get disease regions: %s', e)
            raise Http404 from e
Пример #11
0
    def get_regions(cls, request, dis, context):
        ''' Add the regions of a disease to the context.

        Sets context['title'], context['regions'], context['disease_code'] and
        context['disease'] and returns the context.

        @raise Http404: when the disease is not a site disease or has no regions.
        '''
        es_url = ElasticSettings.url()

        core_diseases, other_diseases = Disease.get_site_diseases(dis_list=dis.upper().split(','))
        if len(core_diseases) == 0 and len(other_diseases) == 0:
            messages.error(request, 'Disease '+dis+' not found.')
            raise Http404()

        # prefer a core disease over any other disease
        if len(core_diseases) > 0:
            disease = core_diseases[0]
        else:
            disease = other_diseases[0]
        context['title'] = getattr(disease, "name")+" Regions"

        locus_docs = DiseaseLocusDocument.get_disease_loci_docs(dis)
        if len(locus_docs) == 0:
            messages.error(request, 'No regions found for '+dis+'.')
            raise Http404()

        locus_hits = [hit for locus in locus_docs for hit in getattr(locus, 'hits')]
        visible_hits = DiseaseLocusDocument.get_hits(locus_hits)
        stats_mapping_url = ElasticSettings.idx("IC_STATS") + '/_mapping'
        meta_response = Search.elastic_request(es_url, stats_mapping_url, is_post=False)

        flank = 500000  # genes are collected this far either side of a region
        regions = []
        candidate_gene_ids = []
        marker_ids = []
        for locus in locus_docs:
            region = locus.get_disease_region(visible_hits)
            if region is None:
                continue

            candidate_gene_ids.extend(region['ens_cand_genes'])
            marker_ids.extend(region['markers'])
            region['hits'] = StudyHitDocument.process_hits(locus.hit_docs, region['all_diseases'])

            coding, non_coding = get_genes_for_region(getattr(locus, "seqid"),
                                                      region['rstart']-flank, region['rstop']+flank)
            in_coding, up_coding, down_coding = _region_up_down(coding, region['rstart'], region['rstop'])
            in_non_coding, up_non_coding, down_non_coding = \
                _region_up_down(non_coding, region['rstart'], region['rstop'])
            region['genes'] = {
                'upstream': {'coding': up_coding, 'non_coding': up_non_coding},
                'region': {'coding': in_coding, 'non_coding': in_non_coding},
                'downstream': {'coding': down_coding, 'non_coding': down_non_coding},
            }
            regions.append(region)

        # pleiotropy: diseases for the markers in IC_STATS and in other study hits
        marker_terms = Query.terms("marker", marker_ids)
        p_value_filter = Filter(RangeQuery("p_value", lte=5E-08))
        stats_search = Search(ElasticQuery.filtered(marker_terms, p_value_filter),
                              idx=ElasticSettings.idx("IC_STATS"), size=len(marker_ids))
        stats_docs = stats_search.search().docs

        # ensembl id to gene doc lookup for all the candidate genes
        cand_gene_docs = gene.utils.get_gene_docs_by_ensembl_id(candidate_gene_ids)
        for region in regions:
            region['cand_genes'] = {ens_id: cand_gene_docs[ens_id]
                                    for ens_id in region.pop("ens_cand_genes", None)}
            (study_ids, region['marker_stats']) = _process_stats(stats_docs, region['markers'], meta_response)

            # diseases coming from the IC/GWAS stats
            stat_diseases = [getattr(mstat, 'disease') for mstat in region['marker_stats']]
            region['all_diseases'].extend(stat_diseases)

            must_clauses = [RangeQuery("tier", lte=2), Query.terms("marker", region['markers'])]
            must_not_clauses = [Query.terms("dil_study_id", study_ids)]
            other_hits_query = ElasticQuery(BoolQuery(must_arr=must_clauses, must_not_arr=must_not_clauses))
            other_hits = Search(other_hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'), size=100).search()
            region['extra_markers'] = StudyHitDocument.process_hits(other_hits.docs, region['all_diseases'])

        context['regions'] = regions
        context['disease_code'] = [dis]
        context['disease'] = getattr(disease, "name")
        return context
Пример #12
0
    def get_disease(cls, request, disease, context):
        ''' Add the disease documents, with their regions and studies, to the context.

        @type  disease: string
        @param disease: comma separated disease code(s); matched in lower case.
        @return: the context with 'features' (the disease docs) and 'title'
                 (their names joined by ', ') set.
        @raise Http404: when no disease is given, none is found, or 9 or more
                        hits are reported.
        '''
        # Check for None before calling a string method on it.
        if disease is None:
            messages.error(request, 'No disease given.')
            raise Http404()
        disease = disease.lower()
        # NOTE(review): the split list is wrapped in another list - confirm that
        # Query.terms expects the nested form.
        query = ElasticQuery(Query.terms("code", [disease.split(',')]))
        elastic = Search(query, idx=ElasticSettings.idx('DISEASE', 'DISEASE'), size=5)
        res = elastic.search()
        if res.hits_total == 0:
            messages.error(request, 'Disease(s) '+disease+' not found.')
        elif res.hits_total < 9:
            disease_docs = res.docs
            names = ', '.join([getattr(doc, 'name') for doc in disease_docs])

            meta_response = Search.elastic_request(ElasticSettings.url(), ElasticSettings.idx("IC_STATS") + '/_mapping',
                                                   is_post=False)
            elastic_meta = json.loads(meta_response.content.decode("utf-8"))
            for dis in disease_docs:
                dis_code = getattr(dis, 'code').upper()
                docs = DiseaseLocusDocument.get_disease_loci_docs(dis_code)
                regions = []
                ens_all_cand_genes = []
                all_markers = []
                for r in docs:
                    region = r.get_disease_region()
                    if region is not None:
                        regions.append(region)
                        ens_all_cand_genes.extend(region['ens_cand_genes'])
                        all_markers.extend(region['markers'])

                # get ensembl to gene symbol mapping for all candidate genes
                all_cand_genes = gene.utils.get_gene_docs_by_ensembl_id(ens_all_cand_genes)
                for region in regions:
                    region['cand_genes'] = {cg: all_cand_genes[cg] for cg in region.pop("ens_cand_genes", None)}
                setattr(dis, 'regions', regions)

                # look for pleiotropy by looking for diseases for the markers in IC_STATS and other study hits
                stats_query = ElasticQuery.filtered(Query.terms("marker", all_markers),
                                                    Filter(RangeQuery("p_value", lte=5E-08)), sources=['marker'])
                stats_docs = Search(stats_query, idx=ElasticSettings.idx("IC_STATS"),
                                    size=len(all_markers)).search().docs

                other_hits_query = ElasticQuery(
                        BoolQuery(must_arr=[RangeQuery("tier", lte=2), Query.terms("marker", all_markers)]),
                        sources=['marker', 'disease'])
                other_hits = Search(other_hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'),
                                    size=5000).search().docs

                for region in regions:
                    diseases = [dis_code]
                    for doc in stats_docs:
                        if getattr(doc, 'marker') in region['markers']:
                            # the disease of a stats doc is read from the _meta of its index type mapping
                            meta_info = elastic_meta[doc.index()]['mappings'][doc.type()]['_meta']
                            if meta_info['disease'] not in diseases:
                                diseases.append(meta_info['disease'])

                    for doc in other_hits:
                        if getattr(doc, 'marker') in region['markers']:
                            if doc.disease is not None and doc.disease not in diseases:
                                diseases.append(doc.disease)
                    region['diseases'] = diseases

                studies = StudyDocument.get_studies(disease_code=dis_code)
                for doc in studies:
                    setattr(doc, 'study_id', getattr(doc, 'study_id').replace('GDXHsS00', ''))
                    pmid = getattr(doc, 'principal_paper')
                    pubs = PublicationDocument.get_publications(pmid, sources=['date', 'authors.name', 'journal'])
                    if len(pubs) > 0:
                        authors = getattr(pubs[0], 'authors')
                        setattr(doc, 'date', getattr(pubs[0], 'date'))
                        setattr(doc, 'journal', getattr(pubs[0], 'journal'))
                        # last word of the first author's name
                        setattr(doc, 'author', authors[0]['name'].rsplit(None, 1)[-1] if authors else "")
                setattr(dis, 'studies',  studies)

            context['features'] = disease_docs
            context['title'] = names
            return context
        raise Http404()
Пример #13
0
    def process_criteria(cls, feature, section, config, sub_class, test=False):
        ''' Top level function that calls the right criteria implementation based on the subclass passed.
            In normal mode it iterates over all the documents of the source index using ScanAndScroll and
            the hits are processed by the inner function process_hits. In test mode it pages through plain
            _search requests instead, and stops as soon as at least one result has been collected.
            The entire result is stored in the module level gl_result_container (a dict), and at the end
            of the processing it is handed to cls.map_and_load.
        @type  feature: string
        @param feature: feature type, could be 'gene','region', 'marker' etc.,
        @type  section: string
        @param section: The section in the criteria.ini file
        @type  config:  string
        @param config: The config object initialized from criteria.ini. If None, it is loaded from
                       'test_criteria.ini' (test mode) or 'criteria.ini'.
        @type  sub_class: string
        @param sub_class: The name of the inherited sub_class where the actual implementation is
                          (its tag_feature_to_disease is called for every hit)
        @type  test: bool
        @keyword test: When True, only a handful of documents are fetched and processed.
        '''
        global gl_result_container
        gl_result_container = {}
        test_mode = test
        if config is None:
            ini_file = 'test_criteria.ini' if test_mode else 'criteria.ini'
            config = CriteriaManager().get_criteria_config(ini_file=ini_file)

        section_config = config[section]
        source_idx = section_config['source_idx']

        # source_idx may be a comma separated list of index keys; resolve each key to its index name
        if ',' in source_idx:
            source_idx = ','.join([ElasticSettings.idx(idx) for idx in source_idx.split(',')])
        else:
            source_idx = ElasticSettings.idx(section_config['source_idx'])

        source_idx_type = None
        if 'source_idx_type' in section_config:
            source_idx_type = section_config['source_idx_type']

        if source_idx_type is not None:
            # an explicit index type overrides whatever was resolved above
            source_idx = ElasticSettings.idx(section_config['source_idx'], idx_type=section_config['source_idx_type'])
        else:
            source_idx_type = ''

        logger.warning(source_idx + ' ' + source_idx_type)

        def process_hits(resp_json):
            ''' Tag every hit of one elastic response and accumulate the outcome in gl_result_container. '''
            global gl_result_container
            global hit_counter
            for hit in resp_json['hits']['hits']:
                hit_counter = hit_counter + 1

                gl_result_container = sub_class.tag_feature_to_disease(hit, section, config,
                                                                       result_container=gl_result_container)

                # a few results are enough when testing
                if test_mode and gl_result_container is not None and len(gl_result_container) > 5:
                    return

        query = cls.get_elastic_query(section, config)

        if test_mode:
            result_size = len(gl_result_container)
            from_ = 0
            size_ = 20
            url = ElasticSettings.url()
            paged = 'mhc' not in section
            while result_size < 1:
                # NOTE(review): from_ is advanced before the first request, so the page at
                # offset 0 is never fetched - confirm this is intended.
                from_ = from_ + size_
                if paged:
                    url_search = (source_idx + '/_search?from=' + str(from_) + '&size=' + str(size_))
                else:
                    url_search = (source_idx + '/_search')

                if query is None:
                    request_body = {
                                    "query": {"match_all": {}},
                                    "size":  20
                                    }
                else:
                    request_body = query.query
                response = Search.elastic_request(url, url_search, data=json.dumps(request_body))

                resp_json = response.json()
                process_hits(resp_json)
                if gl_result_container is not None:
                    result_size = len(gl_result_container)

                # Without this guard the loop never ends when nothing can be tagged: the paged
                # variant keeps asking for pages beyond the end of the index, and the un-paged
                # variant keeps repeating exactly the same request.
                if result_size < 1 and (not paged or not resp_json['hits']['hits']):
                    logger.warning('No criteria results found in test mode for section ' + section)
                    break
        else:
            ScanAndScroll.scan_and_scroll(source_idx, call_fun=process_hits, query=query)

        cls.map_and_load(feature, section, config, gl_result_container)
Пример #14
0
    def marker_is_gwas_significant_in_ic(cls, hit, section=None, config=None, result_container=None):
        """
        Tag the marker of a statistics hit when its p-value is genome-wide significant.

        Equivalent elastic query:
        /hg38_gwas_statistics,hg38_ic_statistics/_search?pretty' -d '{"query":{"range":{"p_value":{"lt": 0.00000005}}}}'

        The disease and study id are read from the ``_meta`` section of the
        mapping of the hit's index/type. Hits without a marker, a disease or a
        p-value, and hits whose p-value is not below 5e-8, leave the container
        untouched.

        @type  hit: dict
        @param hit: elastic hit; ``_source``, ``_id``, ``_index`` and ``_type`` are read
        @param section: not used by this implementation
        @param config: not used by this implementation
        @type  result_container: dict
        @keyword result_container: results accumulated so far; a new dict is
                                   created for each call when omitted
        @return: the (possibly updated) result container
        """
        # A ``{}`` default would be one dict shared by every call that omits the argument.
        if result_container is None:
            result_container = {}

        gw_sig_p = 0.00000005
        feature_doc = hit["_source"]
        feature_doc["_id"] = hit["_id"]

        idx = hit["_index"]
        idx_type = hit["_type"]

        # get meta data
        # studyid and disease
        elastic_url = ElasticSettings.url()
        meta_url = idx + "/" + idx_type + "/_mapping"
        meta_response = Search.elastic_request(elastic_url, meta_url, is_post=False)

        try:
            elastic_meta = json.loads(meta_response.content.decode("utf-8"))
            meta_info = elastic_meta[idx]["mappings"][idx_type]["_meta"]
            disease = meta_info["disease"]
            dil_study_id = meta_info["study"]
        except (ValueError, KeyError, TypeError, AttributeError):
            # unreadable response or mapping without the expected _meta entries
            disease = None
            dil_study_id = None

        marker = feature_doc.get("marker")
        if marker is None or disease is None:
            return result_container

        # a document without a p_value field is treated like one with a null p-value
        p_val = feature_doc.get("p_value")
        if p_val is None:
            return result_container
        global counter
        counter = counter + 1

        p_val_to_compare = float(p_val)
        if p_val_to_compare >= gw_sig_p:
            return result_container

        if dil_study_id is None or dil_study_id == "None":
            first_author = "NA"
            dil_study_id = "NA"
        else:
            # look up the study to label the result with its first author;
            # this expects the study document to exist and to list at least one author
            query = ElasticQuery(Query.ids([dil_study_id]))
            elastic = Search(search_query=query, idx=ElasticSettings.idx("STUDY", "STUDY"), size=1)
            study_doc = elastic.search().docs[0]
            author = getattr(study_doc, "authors")[0]
            first_author = author["name"] + " " + author["initials"]

        fnotes = {
            "linkdata": "pval",
            "linkvalue": p_val_to_compare,
            "linkid": dil_study_id,
            "linkname": first_author,
        }
        return cls.populate_container(
            dil_study_id,
            first_author,
            fnotes=fnotes,
            features=[marker],
            diseases=[disease],
            result_container=result_container,
        )
Пример #15
0
    def add_study_data(self, **options):
        ''' Add gwas stats from a study.

        Deletes the existing study hits of the study, then reads the CSV file and
        posts one study hit document per valid row. Problems are collected per row
        and printed once the whole file has been processed.
        @keyword study_id: the study id (string) the hits belong to
        @keyword addStudyData: path of the CSV file (',' delimiter, '|' quote character)
        '''
        study = options['study_id']
        csv_path = options['addStudyData']
        messages = []
        study_hits_idx = ElasticSettings.idx('REGION', 'STUDY_HITS')
        marker_idx = ElasticSettings.idx('MARKER', 'MARKER')

        print("Deleting study hits for " + study)
        Delete.docs_by_query(study_hits_idx,
                             query=Query.term("dil_study_id", study))

        with open(csv_path, newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for row in reader:
                # skip blank lines and the header row
                if not row or row[0] == 'Marker':
                    continue
                # 0 - Marker
                # 1 - disease
                # 2 - Chromosome
                # 3 - Region Start
                # 4 - Region End
                # 5 - Position
                # 6 - Strand
                # 7 - Major Allele
                # 8 - Minor allele
                # 9 - Minor allele frequency
                # 10 - Discovery P value
                # 11 - Discovery Odds ratio
                # 12 - Discovery 95% confidence interval lower limit
                # 13 - Discovery 95% confidence interval upper limit
                # 14 - Replication P value
                # 15 - Replication Odds ratio
                # 16 - Replication 95% confidence interval lower limit
                # 17 - Replication 95% confidence interval upper limit
                # 18 - Combined P value
                # 19 - Combined Odds ratio
                # 20 - Combined 95% confidence interval lower limit
                # 21 - Combined 95% confidence interval upper limit
                # 22 - PP Colocalisation
                # 23 - Gene
                # 24 - PubMed ID
                # 25 - Other Signal
                # 26 - Notes
                # 27 - Curation status/ failed quality control
                error_prefix = "ERROR loading row of gwas data for " + row[0]

                # look the marker up by id; fall back to the rs history (merged ids)
                result = Search(search_query=ElasticQuery(Query.match("id", row[0])),
                                idx=marker_idx).search()
                if result.hits_total == 0:
                    history = Search(search_query=ElasticQuery(Query.match("rshigh", row[0])),
                                     idx=ElasticSettings.idx('MARKER', 'HISTORY')).search()
                    if history.hits_total > 0:
                        new_id = getattr(history.docs[0], "rscurrent")
                        result = Search(search_query=ElasticQuery(Query.match("id", new_id)),
                                        idx=marker_idx).search()

                if result.hits_total != 1:
                    messages.append(error_prefix + " - Marker cannot be found; <br />\n")
                    # without a unique marker the row cannot be loaded
                    continue
                marker = result.docs[0]

                result = Search(ElasticQuery(Query.match("code", row[1])),
                                idx=ElasticSettings.idx('DISEASE', 'DISEASE')).search()
                if result.hits_total != 1:
                    messages.append(error_prefix + " - Disease cannot be found; <br />\n")
                    continue
                disease = result.docs[0]

                if not re.match(r"^\w$", row[7]):
                    messages.append(error_prefix + " - Major allele is not set; <br />\n")
                    continue
                if not re.match(r"^\w$", row[8]):
                    messages.append(error_prefix + " - Minor allele is not set; <br />\n")
                    continue
                if float(row[9]) > 0.5:
                    messages.append("WARNING - MAF for " + row[0] + " is >0.5; <br />\n")

                # normalise a numeric strand to '+'/'-'; csv values are strings,
                # so they have to be converted before the comparison
                strand = row[6]
                if re.match(r"[+-]?\d+$", strand):
                    strand = '+' if int(strand) > 0 else '-'
                row[6] = strand

                if not re.match(r"\d+", row[2]):
                    row[2] = getattr(marker, "seqid")
                # the position always ends up being the marker's own start
                row[5] = getattr(marker, "start")

                data = {
                    "chr_band": self._get_chr_band(row[2], row[5]),
                    "other_signal": row[25],
                    "species": "Human",
                    "disease": getattr(disease, "code"),
                    "notes": row[26],
                    "disease_locus": "TBC",
                    "dil_study_id": study,
                    "marker": getattr(marker, "id"),
                    "status": "N",
                    "pp_probability": row[22],
                    "tier": 100,
                    "pmid": row[24],
                    "genes": self._get_ens_gene(row[23]),
                }

                data['build_info'] = [self._get_current_build_info(row[2], row[5])]

                data['p_values'] = {
                    'discovery': row[10],
                    'replication': row[14],
                    'combined': row[18],
                }

                data['odds_ratios'] = {
                    'discovery': {"or": row[11], "lower": row[12], "upper": row[13]},
                    'replication': {"or": row[15], "lower": row[16], "upper": row[17]},
                    'combined': {"or": row[19], "lower": row[20], "upper": row[21]},
                }

                data['alleles'] = {
                    'major': row[7],
                    'minor': row[8],
                    'maf': row[9],
                }

                data['suggest'] = {'input': [], 'weight': 1}

                r = Search.elastic_request(ElasticSettings.url(), study_hits_idx, json.dumps(data))
                # 201 is the only status accepted as a successfully created document
                if r.status_code != 201:
                    messages.append(error_prefix + " - Failed to create document; <br />\n")

        print("\n\n" + "".join(messages))