Python ElasticUtils 예제들, elastic.utils.ElasticUtils Python 예제들

예제 #1

0

파일 보기

파일: test_utils.py 프로젝트: ollyburren/django-elastic

 def test_search_count(self):
     ''' Check that a filtered document count is smaller than the full count.

     The GFF_GENERIC index type is counted once without a query and once
     with a query excluding documents whose seqid term is chr1.
     '''
     gff_conf = IDX['GFF_GENERIC']
     index_name = gff_conf['indexName']
     index_type = gff_conf['indexType']

     total = ElasticUtils.get_docs_count(index_name, index_type)
     self.assertGreater(total, 0, 'index count')

     # Exclude chr1; the filtered count has to be strictly below the total.
     not_chr1 = BoolQuery(must_not_arr=[Query.term('seqid', 'chr1')])
     filtered = ElasticUtils.get_docs_count(index_name, index_type,
                                            search_query=ElasticQuery(not_chr1))
     self.assertGreater(total, filtered, 'search query count')

예제 #2

0

파일 보기

파일: test_gene_interactions.py 프로젝트: D-I-L/django-data-pipeline

    def get_interaction_doc(self, interaction_source='intact', parent_id=None):
        '''Fetch an interaction doc and its parent gene doc from elastic.

        @type  interaction_source: string
        @keyword interaction_source: Value matched against the "interaction_source" field.
        @type  parent_id: string
        @keyword parent_id: Optional parent id. When given, the interaction doc must
                            have this "_parent"; otherwise a random doc is fetched.
        @return: tuple of (interaction doc, parent doc)
        '''
        idx_key = 'GENE'
        idx_type_key = 'INTERACTIONS'
        parent_idx_key = 'GENE'

        (idx, idx_type) = ElasticSettings.idx(idx_key, idx_type_key).split('/')

        source_term = Query.term("interaction_source", interaction_source)
        if parent_id:
            qbool = BoolQuery().must([source_term, Query.term("_parent", parent_id)])
        else:
            qbool = BoolQuery().should([source_term])

        # Get random doc or specific if id is passed in query
        docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=qbool, sources=[], size=1)
        # Fail with a readable message instead of an IndexError on docs[0].
        self.assertTrue(docs, 'No interaction doc found for source %s' % interaction_source)
        doc = docs[0]

        # Get parent doc
        doc_parent_id = doc.parent()
        # Lazy %-formatting: no TypeError if the parent id is not a string.
        logger.debug('parent_id : %s', doc_parent_id)
        parent_docs = DataIntegrityUtils.fetch_from_elastic(idx_key, parent_idx_key, [doc_parent_id])

        if parent_docs:
            return doc, parent_docs[0]

        # The parent lookup failed. Retry with another random doc of the SAME
        # source: the previous code always retried with "intact" and re-used the
        # parent id that had just failed, which could recurse without end and
        # hand a doc of the wrong source back to the caller.
        return self.get_interaction_doc(interaction_source)

예제 #3

0

파일 보기

파일: test_utils.py 프로젝트: D-I-L/django-criteria

 def test_criteria_types(self, idx, idx_types, criterias_from_config):
     ''' Verify every configured criteria is a known index type with over 200 docs.

     Each name in criterias_from_config must be contained in idx_types, and
     its document count in idx (printed for reference) must exceed 200.
     '''
     for criteria_name in criterias_from_config:
         self.assertIn(criteria_name, idx_types)
         n_docs = ElasticUtils.get_docs_count(idx, criteria_name)
         print(n_docs)
         self.assertGreater(n_docs, 200, 'Criteria doc count greater than 200')

예제 #4

0

파일 보기

파일: test_ld_rest.py 프로젝트: tcarver/django-data-pipeline

 def _get_random_marker(self):
     ''' Return the id attribute of one random document from the MARKER index. '''
     marker_idx, marker_type = ElasticSettings.idx('MARKER', 'MARKER').split('/')

     # Restrict to a randomly picked seqid in 1..10 and tags.weight >= 80.
     filters = [
         Query.term("seqid", random.randint(1, 10)),
         RangeQuery("tags.weight", gte=80),
     ]
     hits = ElasticUtils.get_rdm_docs(marker_idx, marker_type,
                                      qbool=BoolQuery(must_arr=filters),
                                      sources=['id', 'start'], size=1)
     return getattr(hits[0], 'id')

예제 #5

0

파일 보기

파일: test_utils.py 프로젝트: ollyburren/django-elastic

 def test_get_rdm_docs(self):
     ''' A random-docs request without extra arguments yields one Document. '''
     gff_conf = IDX['GFF_GENERIC']
     rdm_docs = ElasticUtils.get_rdm_docs(gff_conf['indexName'], gff_conf['indexType'])
     self.assertEqual(len(rdm_docs), 1, 'Retrieved one document')
     first_doc = rdm_docs[0]
     self.assertTrue(isinstance(first_doc, Document), 'Document type')

예제 #6

0

파일 보기

파일: test_marker_criteria.py 프로젝트: D-I-L/django-criteria

    def test_get_criteria_details(self):
        ''' Test the criteria details returned for a random marker feature id.

        A random doc is drawn from the MARKER_CRITERIA index, its qid is passed
        to MarkerCriteria.get_criteria_details and the first hit is checked for
        matching id, type, index and disease tag details.
        '''
        config = IniParser().read_ini(MY_INI_FILE)
        idx = ElasticSettings.idx('MARKER_CRITERIA')
        available_criterias = MarkerCriteria.get_available_criterias(config=config)['marker']
        idx_type = ','.join(available_criterias)

        doc_by_idx_type = ElasticUtils.get_rdm_docs(idx, idx_type, size=1)
        self.assertEqual(len(doc_by_idx_type), 1)
        feature_id = getattr(doc_by_idx_type[0], 'qid')

        criteria_details = MarkerCriteria.get_criteria_details(feature_id, config=config)

        hits = criteria_details['hits']
        first_hit = hits[0]
        _type = first_hit['_type']
        _index = first_hit['_index']
        _id = first_hit['_id']
        _source = first_hit['_source']

        disease_tag = _source['disease_tags'][0]
        # assertTrue(a, b) treats b as the failure message and only checks that
        # a is truthy; assertEqual performs the comparison that was intended.
        self.assertEqual(feature_id, _id)
        self.assertIn(_type, idx_type)
        self.assertEqual(idx, _index)
        self.assertIn(disease_tag, _source)

        fdetails = _source[disease_tag][0]
        self.assertIn('fid', fdetails)
        self.assertIn('fname', fdetails)

예제 #7

0

파일 보기

파일: test_regions.py 프로젝트: D-I-L/django-data-pipeline

    def test_region_attributes(self):
        ''' Check that ids listed on a padded random region resolve in their own index.

        For the genes, studies and pmids attributes of the padded region, a
        non-empty id list is searched in the matching index and the total
        number of hits must equal the number of ids.
        '''
        region_idx, region_type = ElasticSettings.idx(RegionDataTest.IDX_KEY, 'REGION').split('/')
        rdm_docs = ElasticUtils.get_rdm_docs(region_idx, region_type, qbool=Query.match_all(),
                                             sources=[], size=1)
        newRegion = utils.Region.pad_region_doc(rdm_docs[0])

        # (region attribute, index key used for both index and type, failure message)
        lookups = (
            ("genes", 'GENE', "All genes on region found in GENE index"),
            ("studies", 'STUDY', "All study ids for region found in STUDY index"),
            ("pmids", 'PUBLICATION', "All PMIDs for region found in PUBLICATION index"),
        )
        for attr_name, target_key, message in lookups:
            if len(getattr(newRegion, attr_name)) > 0:
                ids_query = ElasticQuery(Query.ids(getattr(newRegion, attr_name)))
                search = Search(ids_query, idx=ElasticSettings.idx(target_key, target_key),
                                size=len(getattr(newRegion, attr_name)))
                resultObject = search.search()
                self.assertEqual(len(getattr(newRegion, attr_name)), resultObject.hits_total,
                                 message)

예제 #8

0

파일 보기

파일: test_gene_criteria_data.py 프로젝트: D-I-L/django-criteria

    def test_gene_criteria_types(self):
        """Check gene criteria index types and mappings, then sample a gene_in_region doc."""
        criteria_idx_key = "GENE_CRITERIA"
        criteria_idx = ElasticSettings.idx(criteria_idx_key)

        known_types = CriteriaDataIntegrityUtils.get_criteria_index_types(criteria_idx_key)
        available = Criteria.get_available_criterias("gene")

        CriteriaDataIntegrityTestUtils().test_criteria_types(criteria_idx, known_types, available["gene"])
        CriteriaDataIntegrityTestUtils().test_criteria_mappings(criteria_idx, known_types)

        # Types noted by the author: ['gene_in_region', 'cand_gene_in_region',
        # 'cand_gene_in_study', 'is_gene_in_mhc']; only gene_in_region is sampled here.
        rdm_docs = ElasticUtils.get_rdm_docs(criteria_idx, "gene_in_region", size=1)
        self.assertTrue(len(rdm_docs) == 1, "got back one document")
        gene_doc = rdm_docs[0]

        # Example document:
        #   {'score': 10, 'CRO': [{'fname': '4p11', 'fid': '4p11_005'}],
        #    '_meta': {'_type': 'gene_in_region', '_score': 0.9997835,
        #              '_index': 'pydgin_imb_criteria_gene', '_id': 'ENSG00000250753'},
        #    'disease_tags': ['CRO'], 'qid': 'ENSG00000250753'}
        print(getattr(gene_doc, "qid"))

        # Example values: qid ENSG00000248482, disease_tags ['IBD', 'UC'],
        # notes per tag [{'fid': '5q31.1_013', 'fname': '5q31.1'}]
        tags = getattr(gene_doc, "disease_tags")
        first_tag_notes = getattr(gene_doc, tags[0])
        print(first_tag_notes[0]["fid"])

예제 #9

0

파일 보기

파일: test_gene_interactions.py 프로젝트: D-I-L/django-data-pipeline

    def test_gene_interactions(self):
        ''' Check the interactions doc count, then validate bioplex and intact docs.

        Each source is checked twice: once with a fixed parent gene id and once
        with a randomly selected interaction document.
        '''
        # elastic doc example:
        # "_source":{"interaction_source": "intact", "interactors": [
        # {"interactor": "ENSG00000206053", "pubmed": "16169070"},
        # {"interactor": "ENSG00000101474", "pubmed": "16169070"},
        # {"interactor": "ENSG00000065361", "pubmed": "16169070"},
        # {"interactor": "ENSG00000085465", "pubmed": "16169070"}]}
        (idx, idx_type) = ElasticSettings.idx('GENE', 'INTERACTIONS').split('/')

        # Test doc count
        n_docs = ElasticUtils.get_docs_count(idx, idx_type)
        self.assertGreater(n_docs, 23000, 'Gene doc count greater than 23000')

        # (interaction source, fixed parent id or None for a random doc, checker)
        scenarios = (
            ("bioplex", "ENSG00000164933", self.check_bioplex_data),
            ("bioplex", None, self.check_bioplex_data),
            ("intact", "ENSG00000188786", self.check_intact_data),
            ("intact", None, self.check_intact_data),
        )
        for source, fixed_parent, check in scenarios:
            child_doc, parent_doc = self.get_interaction_doc(source, parent_id=fixed_parent)
            check(child_doc, parent_doc)

예제 #10

0

파일 보기

파일: test_regions.py 프로젝트: D-I-L/django-data-pipeline

    def test_hit_attributes(self):
        ''' Fetch one random document for every configured region index type.

        NOTE(review): the fetched docs are not inspected in the visible code; any
        comparison against external data may be cut off here - confirm against
        the full source.
        '''
        for type_key in RegionDataTest.IDX_TYPE_KEYS:
            (idx, idx_type) = ElasticSettings.idx(RegionDataTest.IDX_KEY, type_key).split('/')

            docs = ElasticUtils.get_rdm_docs(idx, idx_type,
                                             qbool=Query.match_all(),
                                             sources=[],
                                             size=1)

예제 #11

0

파일 보기

파일: test_gene_criteria.py 프로젝트: D-I-L/django-criteria

 def get_random_feature_id(self):
     ''' Return the qid of a random document drawn from the GENE_CRITERIA index.

     The document is searched across all available gene criteria types,
     joined into one comma separated index type string.
     '''
     config = IniParser().read_ini(MY_INI_FILE)
     criteria_idx = ElasticSettings.idx('GENE_CRITERIA')
     gene_types = GeneCriteria.get_available_criterias(config=config)['gene']
     rdm_docs = ElasticUtils.get_rdm_docs(criteria_idx, ','.join(gene_types), size=1)
     self.assertTrue(len(rdm_docs) > 0)
     return getattr(rdm_docs[0], 'qid')

예제 #12

0

파일 보기

파일: test_utils.py 프로젝트: ollyburren/django-elastic

    def test_get_rdm_feature_id(self):
        ''' A random feature id is a string and an ids query on it finds one doc. '''
        gff_conf = IDX['GFF_GENERIC']
        index_name = gff_conf['indexName']
        rdm_id = ElasticUtils.get_rdm_feature_id(index_name, gff_conf['indexType'])

        self.assertTrue(isinstance(rdm_id, str), 'Document id')

        # Look the id up again to prove it refers to exactly one document.
        by_id = ElasticQuery(Query.ids(rdm_id))
        found = Search(by_id, idx=index_name).search().docs
        self.assertTrue(len(found) == 1, 'Document retrieved')

예제 #13

0

파일 보기

파일: test_regions.py 프로젝트: D-I-L/django-data-pipeline

    def test_docs_count(self):
        ''' Compare the doc count of every region index type with its expected value. '''
        for type_key in RegionDataTest.IDX_TYPE_KEYS:
            (idx, idx_type) = ElasticSettings.idx(RegionDataTest.IDX_KEY, type_key).split('/')

            actual = ElasticUtils.get_docs_count(idx, idx_type)
            expected = RegionDataTest.DOC_COUNTS[type_key]
            message = "Count of docs in the " + type_key + " index are correct"
            self.assertEqual(actual, expected, message)

예제 #14

0

파일 보기

파일: test_gene.py 프로젝트: D-I-L/django-data-pipeline

    def test_docs_count(self):
        ''' The GENE index type must hold more than 60000 documents. '''
        gene_idx, gene_type = ElasticSettings.idx('GENE', 'GENE').split('/')
        n_genes = ElasticUtils.get_docs_count(gene_idx, gene_type)
        self.assertGreater(n_genes, 60000, 'Gene doc count greater than 60000')

예제 #15

0

파일 보기

파일: test_regions.py 프로젝트: tottlefields/pydgin

 def test_hit2region(self):
     ''' Test region returned for hit id.

     A random hit with tier <= 2 is converted to its region; the region name
     must match the hit's chr_band (case-insensitively) and the hit's disease
     must be among the region's disease tags.
     '''
     docs = ElasticUtils.get_rdm_docs(RegionTest.idx, RegionTest.idx_type,
                                      qbool=RangeQuery("tier", lte=2), sources=[], size=1)
     regions = utils.Region.hits_to_regions(docs)
     self.assertEqual(len(regions), 1)
     region_doc = regions[0]
     hit_doc = docs[0]
     self.assertEqual(getattr(hit_doc, "chr_band").lower(), getattr(region_doc, "region_name").lower())
     disease = getattr(hit_doc, "disease")
     # The message previously lacked the space between the disease code and "exists".
     self.assertIn(disease, getattr(region_doc, "tags")['disease'],
                   disease + " exists in list of tagged diseases on parent region")

예제 #16

0

파일 보기

파일: test_gene.py 프로젝트: D-I-L/django-data-pipeline

    def test_gene_attributes(self):
        '''Fetch random genes from elastic and compare the same with the results fetched via ensembl restful query.

        For each fetched gene doc the assembly number, id, coordinates, strand,
        biotype, symbol and source are compared with the data returned by
        DataIntegrityUtils.fetch_from_ensembl. When that returns nothing, only a
        warning is logged and no comparison is made.
        '''
        idx_key = 'GENE'
        idx_type_key = 'GENE'

        idx = ElasticSettings.idx(idx_key, idx_type_key)
        (idx, idx_type) = idx.split('/')

        docs_by_geneid = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(), sources=[], size=1)

        # Raw strings keep \w and \d from being read as (invalid) string escapes;
        # both patterns are loop-invariant, so they are compiled once up front.
        # Index names look like: genes_hg38_v0.0.2
        index_pattern = re.compile(r'genes_\w\w(\d+)', re.IGNORECASE)
        assembly_pattern = re.compile(r'GRCh(\d+)', re.IGNORECASE)

        # "_source":{"symbol": "RP11-376M2.2", "start": 42975689, "biotype": "sense_intronic", "chromosome": "17",
        # "source": "havana", "strand": "-", "stop": 42977275}
        for doc in docs_by_geneid:
            gene_id_pipeline = doc.doc_id()
            index_pipeline = doc.index()
            start_pipeline = getattr(doc, "start")
            stop_pipeline = getattr(doc, "stop")
            chromosome_pipeline = getattr(doc, "chromosome")

            biotype_pipeline = getattr(doc, "biotype")
            strand_pipeline = getattr(doc, "strand")
            # The doc stores the strand as '-' or other; compare it as -1 / 1.
            strand_pipeline = -1 if strand_pipeline == '-' else 1
            symbol_pipeline = getattr(doc, "symbol")
            source_pipeline = getattr(doc, "source")

            match = index_pattern.match(index_pipeline)
            assembly_number_pipeline = match.group(1) if match else None

            ensembl_gene_data = DataIntegrityUtils.fetch_from_ensembl(gene_id_pipeline)

            if ensembl_gene_data:
                match = assembly_pattern.match(ensembl_gene_data['assembly_name'])
                assembly_number_ens = match.group(1) if match else None

                self.assertEqual(assembly_number_pipeline, assembly_number_ens, "Assembly number is ok")
                self.assertEqual(gene_id_pipeline, ensembl_gene_data['id'], "Gene Id number is ok")
                self.assertEqual(start_pipeline, ensembl_gene_data['start'], "start is ok")
                self.assertEqual(stop_pipeline, ensembl_gene_data['end'], "stop is ok")
                self.assertEqual(chromosome_pipeline, ensembl_gene_data['seq_region_name'], "chr is ok")
                self.assertEqual(strand_pipeline, ensembl_gene_data['strand'], "strand is ok")

                self.assertEqual(biotype_pipeline, ensembl_gene_data['biotype'], "biotype is ok")
                self.assertEqual(symbol_pipeline, ensembl_gene_data['display_name'], "symbol/display_name is ok")
                self.assertEqual(source_pipeline, ensembl_gene_data['source'], "source is ok")
            else:
                # Logger.warn is a deprecated alias of Logger.warning.
                logger.warning("No test run....no ensembl data via ensembl webservice")

예제 #17

0

파일 보기

파일: criteria.py 프로젝트: D-I-L/django-criteria

    def get_elastic_query(cls, section=None, config=None):
        ''' Build the elastic query object for one criteria section.
        @type  section: string
        @keyword section: The section name, used as key into config.
        @type  config:  mapping
        @keyword config: Maps section names to their settings; the optional
                         'source_fields' entry is a comma separated string.
        @return: the query object, or None when the section is not one of the
                 special cases and defines no source fields
        '''
        section_config = config[section]

        source_fields = []
        if 'source_fields' in section_config:
            source_fields = section_config['source_fields'].split(',')

        if 'mhc' in section:
            # Defined MHC region as chr6:25,000,000..35,000,000
            mhc_seqid, mhc_start, mhc_end = '6', 25000000, 35000000

            seqid_param = section_config['seqid_param']
            start_param = section_config['start_param']
            end_param = section_config['end_param']

        if section == 'is_gene_in_mhc':
            # for region you should make a different query
            return ElasticUtils.range_overlap_query(mhc_seqid, mhc_start, mhc_end,
                                                    source_fields,
                                                    seqid_param,
                                                    start_param,
                                                    end_param)

        if section == 'is_marker_in_mhc':
            # Marker start has to lie inside the MHC range on the MHC seqid.
            mhc_filter = BoolQuery()
            (mhc_filter.must(RangeQuery("start", lte=mhc_end))
                       .must(RangeQuery("start", gte=mhc_start))
                       .must(Query.term("seqid", mhc_seqid)))
            return ElasticQuery.filtered_bool(Query.match_all(), mhc_filter,
                                              sources=["id", "seqid", "start"])

        if section == 'is_region_in_mhc':
            return ElasticQuery(Query.term("region_name", "MHC"))

        if section == 'marker_is_gwas_significant_in_ic':
            # build a range query on the p_value upper bound
            gw_sig_p = 0.00000005
            return ElasticQuery(RangeQuery("p_value", lte=gw_sig_p))

        if source_fields:
            return ElasticQuery(Query.match_all(), sources=source_fields)

        # No special case and no source fields: there is nothing to query.
        return None

예제 #18

0

파일 보기

파일: test_marker_criteria.py 프로젝트: D-I-L/django-criteria

    def test_get_disease_tags(self):
        ''' Disease tag docs are returned for the qid of a random marker criteria doc. '''
        config = IniParser().read_ini(MY_INI_FILE)
        marker_idx = ElasticSettings.idx('MARKER_CRITERIA')
        marker_types = MarkerCriteria.get_available_criterias(config=config)['marker']
        rdm_docs = ElasticUtils.get_rdm_docs(marker_idx, ','.join(marker_types), size=1)
        self.assertTrue(len(rdm_docs) == 1)

        disease_docs = MarkerCriteria.get_disease_tags(getattr(rdm_docs[0], 'qid'))
        self.assertIsNotNone(disease_docs, 'got back result docs')

        # Collect the code attribute of every returned disease doc.
        codes = []
        for disease_doc in disease_docs:
            codes.append(getattr(disease_doc, 'code'))
        self.assertIsNotNone(codes, "got back disease tags")

예제 #19

0

파일 보기

파일: test_marker_criteria_data.py 프로젝트: D-I-L/django-criteria

    def test_marker_criteria_types(self):
        '''Test if the marker criteria indexes have records.

        Checks the configured marker criteria types and their mappings, then
        fetches one random doc of the rsq_with_index_snp type.
        '''
        idx_key = 'MARKER_CRITERIA'
        feature_type = 'marker'
        idx = ElasticSettings.idx(idx_key)

        idx_types = CriteriaDataIntegrityUtils.get_criteria_index_types(idx_key)
        marker_criterias = Criteria.get_available_criterias(feature_type)

        # Index the result with the requested feature type; the previous 'gene'
        # key was carried over from the gene criteria test.
        CriteriaDataIntegrityTestUtils().test_criteria_types(idx, idx_types, marker_criterias[feature_type])
        CriteriaDataIntegrityTestUtils().test_criteria_mappings(idx, idx_types)

        # get a random doc for one of the marker criteria types
        idx_type = 'rsq_with_index_snp'
        doc_by_idx_type = ElasticUtils.get_rdm_docs(idx, idx_type, size=1)
        self.assertTrue(len(doc_by_idx_type) == 1, 'got back one document')

예제 #20

0

파일 보기

파일: test_regions.py 프로젝트: tottlefields/pydgin

    def test_pad_region(self):
        ''' Test the padding of a region based on its disease_loci & hits.

        The raw random region must have falsy detail attributes; after
        pad_region_doc the build info, markers and hits must be populated.
        '''
        region_idx, region_type = ElasticSettings.idx(RegionTest.IDX_KEY, 'REGION').split('/')
        rdm_docs = ElasticUtils.get_rdm_docs(region_idx, region_type, qbool=Query.match_all(),
                                             sources=[], size=1)
        region = rdm_docs[0]

        # Attributes expected to be empty before padding, with failure messages.
        unpadded_checks = (
            ("build_info", "Region doesn't contain any positional details"),
            ("markers", "Region doesn't contain any marker details"),
            ("hits", "Region doesn't contain any HIT details"),
            ("genes", "Region doesn't contain any gene details"),
            ("studies", "Region doesn't contain any study details"),
            ("pmids", "Region doesn't contain any publication details"),
        )
        for attr_name, message in unpadded_checks:
            self.assertFalse(getattr(region, attr_name), message)

        newRegion = utils.Region.pad_region_doc(region)
        self.assertTrue(getattr(newRegion, "build_info"), "New region contains positional details")

        # List attributes that must be present and hold at least one entry.
        padded_checks = (
            ("markers", "New region contains marker details", "New region contains at least 1 marker"),
            ("hits", "New region contains hit details", "New region contains at least 1 HIT"),
        )
        for attr_name, present_msg, count_msg in padded_checks:
            self.assertTrue(getattr(newRegion, attr_name), present_msg)
            self.assertGreaterEqual(len(getattr(newRegion, attr_name)), 1, count_msg)

예제 #21

0

파일 보기

파일: test_recombination.py 프로젝트: D-I-L/django-data-pipeline

 def test_data_loaded(self):
     ''' Test the HAPMAP index type holds more than 3000000 documents. '''
     hapmap_idx = ElasticSettings.idx('HAPMAP', idx_type='HAPMAP')
     (idx, idx_type) = hapmap_idx.split('/')
     n_docs = ElasticUtils.get_docs_count(idx, idx_type)
     self.assertGreater(n_docs, 3000000)

예제 #22

0

파일 보기

파일: test_utils.py 프로젝트: ollyburren/django-elastic

 def test_get_rdm_feature_ids(self):
     ''' Test get random feature ids: requesting size=2 must return two ids. '''
     idx = IDX['GFF_GENERIC']['indexName']
     idx_type = IDX['GFF_GENERIC']['indexType']
     ids = ElasticUtils.get_rdm_feature_ids(idx, idx_type, size=2)
     # The message used to say "one document" although two ids are expected.
     self.assertEqual(len(ids), 2, 'Retrieved two documents')

예제 #23

0

파일 보기

파일: test_bands.py 프로젝트: D-I-L/django-data-pipeline

 def test_bands(self):
     ''' Test the BAND index type holds more than 1200 documents. '''
     band_idx = ElasticSettings.idx('BAND', idx_type='BAND')
     (idx, idx_type) = band_idx.split('/')
     n_docs = ElasticUtils.get_docs_count(idx, idx_type)
     self.assertGreater(n_docs, 1200)