예제 #1
0
 def test_search_count(self):
     ''' Compare the total document count with the count for a filtered search. '''
     index_name = IDX['GFF_GENERIC']['indexName']
     index_type = IDX['GFF_GENERIC']['indexType']

     total = ElasticUtils.get_docs_count(index_name, index_type)
     self.assertGreater(total, 0, 'index count')

     # Count only the documents whose seqid is not 'chr1'; the test expects
     # this to be strictly smaller than the unfiltered total.
     not_chr1 = BoolQuery(must_not_arr=[Query.term('seqid', 'chr1')])
     filtered = ElasticUtils.get_docs_count(index_name, index_type,
                                            search_query=ElasticQuery(not_chr1))
     self.assertGreater(total, filtered, 'search query count')
    def get_interaction_doc(self, interaction_source='intact', parent_id=None):
        '''Fetch an interaction doc and its parent doc from elastic.

        @type  interaction_source: string
        @keyword interaction_source: Value matched against the "interaction_source" field.
        @type  parent_id: string
        @keyword parent_id: When given, only interaction docs whose "_parent" term
                            matches this id are considered; otherwise a random
                            interaction doc for the source is picked.
        @return: tuple of (interaction doc, parent doc)
        '''
        idx_key = 'GENE'
        idx_type_key = 'INTERACTIONS'
        parent_idx_key = 'GENE'

        (idx, idx_type) = ElasticSettings.idx(idx_key, idx_type_key).split('/')

        if parent_id:
            qbool = BoolQuery().must([Query.term("interaction_source", interaction_source),
                                      Query.term("_parent", parent_id)])
            # Re-running the same parent-restricted query cannot find a different parent.
            max_attempts = 1
        else:
            qbool = BoolQuery().should([Query.term("interaction_source", interaction_source)])
            # A random pick may hit a doc without a parent; try a few other docs
            # instead of recursing without bound.
            max_attempts = 5

        for _ in range(max_attempts):
            # Get random doc or specific if id is passed in query
            docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=qbool, sources=[], size=1)
            self.assertTrue(docs, "Found an interaction doc for source " + str(interaction_source))
            doc = docs[0]

            # Get parent doc
            doc_parent_id = doc.parent()
            # %s formatting keeps the debug call from raising if the id is not a string.
            logger.debug('parent_id : %s', doc_parent_id)
            parent_docs = DataIntegrityUtils.fetch_from_elastic(idx_key, parent_idx_key, [doc_parent_id])

            if parent_docs:
                return doc, parent_docs[0]

        self.fail("No parent doc found for interaction source " + str(interaction_source))
예제 #3
0
 def test_criteria_types(self, idx, idx_types, criterias_from_config):
     '''Check every configured criteria is an index type holding more than 200 docs.

     @type  idx: string
     @keyword idx: Index name passed to the doc count lookup.
     @type  idx_types: collection
     @keyword idx_types: Index types that each criteria must be a member of.
     @type  criterias_from_config: iterable
     @keyword criterias_from_config: Criteria names to verify.
     '''
     for criteria_name in criterias_from_config:
         self.assertIn(criteria_name, idx_types)
         n_docs = ElasticUtils.get_docs_count(idx, criteria_name)
         print(n_docs)
         self.assertGreater(n_docs, 200, 'Criteria doc count greater than 200')
예제 #4
0
 def _get_random_marker(self):
     ''' Return the id of a random marker doc from the MARKER index. '''
     index_name, index_type = ElasticSettings.idx('MARKER', 'MARKER').split('/')

     # Restrict the pick to a random seqid in 1..10 and to docs with tags.weight >= 80.
     chosen_seqid = random.randint(1, 10)
     filters = [Query.term("seqid", chosen_seqid), RangeQuery("tags.weight", gte=80)]

     hits = ElasticUtils.get_rdm_docs(index_name, index_type, qbool=BoolQuery(must_arr=filters),
                                      sources=['id', 'start'], size=1)
     return getattr(hits[0], 'id')
예제 #5
0
 def test_get_rdm_docs(self):
     ''' A default random-doc request returns exactly one Document. '''
     gff_index = IDX['GFF_GENERIC']['indexName']
     gff_type = IDX['GFF_GENERIC']['indexType']

     random_docs = ElasticUtils.get_rdm_docs(gff_index, gff_type)

     self.assertEqual(len(random_docs), 1, 'Retrieved one document')
     first = random_docs[0]
     self.assertTrue(isinstance(first, Document), 'Document type')
예제 #6
0
    def test_get_criteria_details(self):
        ''' Test the criteria details returned for a randomly picked marker feature id. '''
        config = IniParser().read_ini(MY_INI_FILE)
        idx = ElasticSettings.idx('MARKER_CRITERIA')
        available_criterias = MarkerCriteria.get_available_criterias(config=config)['marker']
        idx_type = ','.join(available_criterias)

        doc_by_idx_type = ElasticUtils.get_rdm_docs(idx, idx_type, size=1)
        self.assertEqual(len(doc_by_idx_type), 1)
        feature_id = getattr(doc_by_idx_type[0], 'qid')

        criteria_details = MarkerCriteria.get_criteria_details(feature_id, config=config)

        hits = criteria_details['hits']
        self.assertTrue(hits, 'got back at least one hit')
        first_hit = hits[0]
        _type = first_hit['_type']
        _index = first_hit['_index']
        _id = first_hit['_id']
        _source = first_hit['_source']

        disease_tag = _source['disease_tags'][0]
        # assertTrue(a, b) treats b as the failure message, so it never compared
        # the two values; assertEqual performs the intended comparison.
        self.assertEqual(feature_id, _id)
        # Check membership in the list of criteria rather than a substring match
        # against the comma-joined string.
        self.assertIn(_type, available_criterias)
        # NOTE(review): assumes ElasticSettings.idx() returns the concrete index
        # name reported in the hit (not an alias) - confirm.
        self.assertEqual(idx, _index)
        self.assertIn(disease_tag, list(_source.keys()))

        fdetails = _source[disease_tag][0]
        self.assertIn('fid', fdetails.keys())
        self.assertIn('fname', fdetails.keys())
예제 #7
0
    def test_region_attributes(self):
        ''' Every gene, study and PMID id on a padded region resolves in its own index. '''
        (idx, idx_type) = ElasticSettings.idx(RegionDataTest.IDX_KEY, 'REGION').split('/')
        region_docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(), sources=[], size=1)
        padded = utils.Region.pad_region_doc(region_docs[0])

        # (region attribute, index key used for both idx and idx type, failure message)
        lookups = (
            ("genes", 'GENE', "All genes on region found in GENE index"),
            ("studies", 'STUDY', "All study ids for region found in STUDY index"),
            ("pmids", 'PUBLICATION', "All PMIDs for region found in PUBLICATION index"),
        )

        for attr_name, target_key, message in lookups:
            if len(getattr(padded, attr_name)) > 0:
                ids_query = ElasticQuery(Query.ids(getattr(padded, attr_name)))
                result = Search(ids_query, idx=ElasticSettings.idx(target_key, target_key),
                                size=len(getattr(padded, attr_name))).search()
                self.assertEqual(len(getattr(padded, attr_name)), result.hits_total, message)
    def test_gene_criteria_types(self):
        """Check the gene criteria index types and mappings, then read a random doc."""
        criteria_key = "GENE_CRITERIA"
        feature = "gene"
        idx = ElasticSettings.idx(criteria_key)

        idx_types = CriteriaDataIntegrityUtils.get_criteria_index_types(criteria_key)
        available = Criteria.get_available_criterias(feature)

        CriteriaDataIntegrityTestUtils().test_criteria_types(idx, idx_types, available["gene"])
        CriteriaDataIntegrityTestUtils().test_criteria_mappings(idx, idx_types)

        # Only the 'gene_in_region' type is sampled here.
        random_docs = ElasticUtils.get_rdm_docs(idx, "gene_in_region", size=1)
        self.assertTrue(len(random_docs) == 1, "got back one document")
        region_doc = random_docs[0]

        # Example document:
        #   {'score': 10, 'CRO': [{'fname': '4p11', 'fid': '4p11_005'}],
        #    '_meta': {'_type': 'gene_in_region', '_score': 0.9997835,
        #              '_index': 'pydgin_imb_criteria_gene', '_id': 'ENSG00000250753'},
        #    'disease_tags': ['CRO'], 'qid': 'ENSG00000250753'}
        print(getattr(region_doc, "qid"))

        # The notes for a disease are stored under an attribute named after the
        # disease tag, e.g. [{'fid': '5q31.1_013', 'fname': '5q31.1'}].
        tags = getattr(region_doc, "disease_tags")
        first_tag_notes = getattr(region_doc, tags[0])
        print(first_tag_notes[0]["fid"])
    def test_gene_interactions(self):
        '''Check the interactions doc count, then validate bioplex and intact docs.'''
        # Example of an interactions doc:
        # "_source":{"interaction_source": "intact", "interactors": [
        # {"interactor": "ENSG00000206053", "pubmed": "16169070"},
        # {"interactor": "ENSG00000101474", "pubmed": "16169070"},
        # {"interactor": "ENSG00000065361", "pubmed": "16169070"},
        # {"interactor": "ENSG00000085465", "pubmed": "16169070"}]}
        (idx, idx_type) = ElasticSettings.idx('GENE', 'INTERACTIONS').split('/')

        # Test doc count
        total_docs = ElasticUtils.get_docs_count(idx, idx_type)
        self.assertGreater(total_docs, 23000, 'Gene doc count greater than 23000')

        # For each source: first a fixed parent id, then a random doc.
        scenarios = (
            ("bioplex", {"parent_id": "ENSG00000164933"}, self.check_bioplex_data),
            ("bioplex", {}, self.check_bioplex_data),
            ("intact", {"parent_id": "ENSG00000188786"}, self.check_intact_data),
            ("intact", {}, self.check_intact_data),
        )
        for source_name, lookup_kwargs, checker in scenarios:
            (child_doc, parent_doc) = self.get_interaction_doc(source_name, **lookup_kwargs)
            checker(child_doc, parent_doc)
예제 #10
0
    def test_hit_attributes(self):
        '''Fetch one random doc from each index type listed in RegionDataTest.IDX_TYPE_KEYS.

        NOTE(review): the visible body only fetches the docs and never inspects them;
        the method looks truncated - confirm against the full source.
        '''

        for idx_type_key in RegionDataTest.IDX_TYPE_KEYS:
            # ElasticSettings.idx returns an "index/type" string, split into its two parts.
            idx = ElasticSettings.idx(RegionDataTest.IDX_KEY, idx_type_key)
            (idx, idx_type) = idx.split('/')

            # Request a single random doc matching everything, with sources=[].
            docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(), sources=[], size=1)
예제 #11
0
 def get_random_feature_id(self):
     ''' Return the 'qid' of a random doc taken from the gene criteria index types. '''
     ini_config = IniParser().read_ini(MY_INI_FILE)
     criteria_idx = ElasticSettings.idx('GENE_CRITERIA')

     # Search across all available gene criteria types at once.
     gene_criterias = GeneCriteria.get_available_criterias(config=ini_config)['gene']
     joined_types = ','.join(gene_criterias)

     random_docs = ElasticUtils.get_rdm_docs(criteria_idx, joined_types, size=1)
     self.assertTrue(len(random_docs) > 0)
     return getattr(random_docs[0], 'qid')
예제 #12
0
    def test_get_rdm_feature_id(self):
        ''' A random feature id is a string and resolves to exactly one document. '''
        gff_index = IDX['GFF_GENERIC']['indexName']
        gff_type = IDX['GFF_GENERIC']['indexType']

        feature_id = ElasticUtils.get_rdm_feature_id(gff_index, gff_type)
        self.assertTrue(isinstance(feature_id, str), 'Document id')

        # Look the id up again to confirm it points at a single doc.
        id_query = ElasticQuery(Query.ids(feature_id))
        found = Search(id_query, idx=gff_index).search().docs
        self.assertTrue(len(found) == 1, 'Document retrieved')
예제 #13
0
    def test_docs_count(self):
        '''Compare each index type's doc count with RegionDataTest.DOC_COUNTS.'''
        for type_key in RegionDataTest.IDX_TYPE_KEYS:
            (index_name, index_type) = ElasticSettings.idx(RegionDataTest.IDX_KEY, type_key).split('/')

            actual = ElasticUtils.get_docs_count(index_name, index_type)
            expected = RegionDataTest.DOC_COUNTS[type_key]
            message = "Count of docs in the "+type_key+" index are correct"
            self.assertEqual(actual, expected, message)
예제 #14
0
    def test_docs_count(self):
        '''The GENE index type must hold more than 60000 docs.'''
        (index_name, index_type) = ElasticSettings.idx('GENE', 'GENE').split('/')

        n_genes = ElasticUtils.get_docs_count(index_name, index_type)
        self.assertGreater(n_genes, 60000, 'Gene doc count greater than 60000')
예제 #15
0
 def test_hit2region(self):
     ''' Test the region returned for a random hit with tier <= 2. '''
     docs = ElasticUtils.get_rdm_docs(RegionTest.idx, RegionTest.idx_type,
                                      qbool=RangeQuery("tier", lte=2), sources=[], size=1)
     regions = utils.Region.hits_to_regions(docs)
     self.assertEqual(len(regions), 1)
     region_doc = regions[0]
     hit_doc = docs[0]

     # The hit's chromosome band must match the region name, ignoring case.
     self.assertEqual(getattr(hit_doc, "chr_band").lower(), getattr(region_doc, "region_name").lower())

     disease = getattr(hit_doc, "disease")
     # Leading space added so the failure message does not run the disease code
     # straight into the following word.
     self.assertIn(disease, getattr(region_doc, "tags")['disease'],
                   disease + " exists in list of tagged diseases on parent region")
예제 #16
0
    def test_gene_attributes(self):
        '''Fetch random genes from elastic and compare the same with the results fetched via ensembl restful query'''
        idx_key = 'GENE'
        idx_type_key = 'GENE'

        idx = ElasticSettings.idx(idx_key, idx_type_key)
        (idx, idx_type) = idx.split('/')

        docs_by_geneid = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(), sources=[], size=1)

        # Raw strings avoid invalid-escape warnings for \w and \d; the patterns
        # are compiled once rather than on every loop iteration.
        # Index names look like: genes_hg38_v0.0.2
        index_assembly_pattern = re.compile(r'genes_\w\w(\d+)', re.IGNORECASE)
        ensembl_assembly_pattern = re.compile(r'GRCh(\d+)', re.IGNORECASE)

        # "_source":{"symbol": "RP11-376M2.2", "start": 42975689, "biotype": "sense_intronic", "chromosome": "17",
        # "source": "havana", "strand": "-", "stop": 42977275}
        for doc in docs_by_geneid:
            gene_id_pipeline = doc.doc_id()
            index_pipeline = doc.index()
            start_pipeline = getattr(doc, "start")
            stop_pipeline = getattr(doc, "stop")
            chromosome_pipeline = getattr(doc, "chromosome")

            biotype_pipeline = getattr(doc, "biotype")
            # The index stores the strand as '-' or something else; the values
            # compared below are -1 and 1.
            strand_pipeline = -1 if getattr(doc, "strand") == '-' else 1
            symbol_pipeline = getattr(doc, "symbol")
            source_pipeline = getattr(doc, "source")

            match = index_assembly_pattern.match(index_pipeline)
            assembly_number_pipeline = match.group(1) if match else None

            ensembl_gene_data = DataIntegrityUtils.fetch_from_ensembl(gene_id_pipeline)

            if ensembl_gene_data:
                match = ensembl_assembly_pattern.match(ensembl_gene_data['assembly_name'])
                assembly_number_ens = match.group(1) if match else None

                self.assertEqual(assembly_number_pipeline, assembly_number_ens, "Assembly number is ok")
                self.assertEqual(gene_id_pipeline, ensembl_gene_data['id'], "Gene Id number is ok")
                self.assertEqual(start_pipeline, ensembl_gene_data['start'], "start is ok")
                self.assertEqual(stop_pipeline, ensembl_gene_data['end'], "stop is ok")
                self.assertEqual(chromosome_pipeline, ensembl_gene_data['seq_region_name'], "chr is ok")
                self.assertEqual(strand_pipeline, ensembl_gene_data['strand'], "strand is ok")

                self.assertEqual(biotype_pipeline, ensembl_gene_data['biotype'], "biotype is ok")
                self.assertEqual(symbol_pipeline, ensembl_gene_data['display_name'], "symbol/display_name is ok")
                self.assertEqual(source_pipeline, ensembl_gene_data['source'], "source is ok")
            else:
                # logger.warn is a deprecated alias; warning() is the supported name.
                logger.warning("No test run....no ensembl data via ensembl webservice")
예제 #17
0
    def get_elastic_query(cls, section=None, config=None):
        ''' Build the elastic query object for a criteria section.
        @type  section: string
        @keyword section: The section in the criteria.ini file
        @type  config:  string
        @keyword config: The config object initialized from criteria.ini.
        @return: L{Query}, or None when the section has no dedicated query
                 and no 'source_fields' entry
        '''
        section_config = config[section]

        source_fields = []
        if 'source_fields' in section_config:
            source_fields = section_config['source_fields'].split(',')

        if 'mhc' in section:
            # Defined MHC region as chr6:25,000,000..35,000,000
            seqid, start_range, end_range = '6', 25000000, 35000000

            # Read for every section whose name contains 'mhc', whether or not
            # the branch taken below ends up using them.
            seqid_param = section_config['seqid_param']
            start_param = section_config['start_param']
            end_param = section_config['end_param']

        if section == 'is_gene_in_mhc':
            return ElasticUtils.range_overlap_query(seqid, start_range, end_range,
                                                    source_fields,
                                                    seqid_param,
                                                    start_param,
                                                    end_param)

        if section == 'is_marker_in_mhc':
            # Markers are matched on their start position and seqid only.
            marker_filter = BoolQuery()
            (marker_filter.must(RangeQuery("start", lte=end_range))
                          .must(RangeQuery("start", gte=start_range))
                          .must(Query.term("seqid", seqid)))
            return ElasticQuery.filtered_bool(Query.match_all(), marker_filter,
                                              sources=["id", "seqid", "start"])

        if section == 'is_region_in_mhc':
            return ElasticQuery(Query.term("region_name", "MHC"))

        if section == 'marker_is_gwas_significant_in_ic':
            # Range query on p_value with an upper bound.
            gw_sig_p = 0.00000005
            return ElasticQuery(RangeQuery("p_value", lte=gw_sig_p))

        if len(source_fields) > 0:
            return ElasticQuery(Query.match_all(), sources=source_fields)

        return None
예제 #18
0
    def test_get_disease_tags(self):
        ''' Disease tags can be fetched for a randomly chosen marker feature id. '''
        ini_config = IniParser().read_ini(MY_INI_FILE)
        criteria_idx = ElasticSettings.idx('MARKER_CRITERIA')
        marker_criterias = MarkerCriteria.get_available_criterias(config=ini_config)['marker']
        joined_types = ','.join(marker_criterias)

        random_docs = ElasticUtils.get_rdm_docs(criteria_idx, joined_types, size=1)
        self.assertTrue(len(random_docs) == 1)
        feature_id = getattr(random_docs[0], 'qid')

        disease_docs = MarkerCriteria.get_disease_tags(feature_id)
        self.assertIsNotNone(disease_docs, 'got back result docs')

        # Collect the 'code' attribute of every returned disease doc.
        codes = []
        for disease_doc in disease_docs:
            codes.append(getattr(disease_doc, 'code'))
        self.assertIsNotNone(codes, "got back disease tags")
    def test_marker_criteria_types(self):
        '''Test if the marker criteria indexes have records'''
        idx_key = 'MARKER_CRITERIA'
        feature_type = 'marker'
        idx = ElasticSettings.idx(idx_key)

        idx_types = CriteriaDataIntegrityUtils.get_criteria_index_types(idx_key)
        marker_criterias = Criteria.get_available_criterias(feature_type)

        # Index the result with the feature type that was requested; the original
        # looked up the 'gene' key although 'marker' criterias were fetched.
        CriteriaDataIntegrityTestUtils().test_criteria_types(idx, idx_types, marker_criterias[feature_type])
        CriteriaDataIntegrityTestUtils().test_criteria_mappings(idx, idx_types)

        # Only the 'rsq_with_index_snp' type is sampled here.
        idx_type = 'rsq_with_index_snp'
        doc_by_idx_type = ElasticUtils.get_rdm_docs(idx, idx_type, size=1)
        self.assertEqual(len(doc_by_idx_type), 1, 'got back one document')
예제 #20
0
    def test_pad_region(self):
        ''' A raw region doc has no detail attributes; the padded copy gains them. '''
        (idx, idx_type) = ElasticSettings.idx(RegionTest.IDX_KEY, 'REGION').split('/')
        docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(), sources=[], size=1)
        raw_region = docs[0]

        # Before padding, every one of these attributes is expected to be falsy.
        absent_before_padding = (
            ("build_info", "Region doesn't contain any positional details"),
            ("markers", "Region doesn't contain any marker details"),
            ("hits", "Region doesn't contain any HIT details"),
            ("genes", "Region doesn't contain any gene details"),
            ("studies", "Region doesn't contain any study details"),
            ("pmids", "Region doesn't contain any publication details"),
        )
        for attr_name, message in absent_before_padding:
            self.assertFalse(getattr(raw_region, attr_name), message)

        padded = utils.Region.pad_region_doc(raw_region)
        self.assertTrue(getattr(padded, "build_info"), "New region contains positional details")

        # Markers and hits must be present and hold at least one entry each.
        populated_after_padding = (
            ("markers", "New region contains marker details", "New region contains at least 1 marker"),
            ("hits", "New region contains hit details", "New region contains at least 1 HIT"),
        )
        for attr_name, present_msg, count_msg in populated_after_padding:
            self.assertTrue(getattr(padded, attr_name), present_msg)
            self.assertGreaterEqual(len(getattr(padded, attr_name)), 1, count_msg)
 def test_data_loaded(self):
     ''' The HAPMAP index type must hold more than 3000000 docs. '''
     index_name, index_type = ElasticSettings.idx('HAPMAP', idx_type='HAPMAP').split('/')
     n_docs = ElasticUtils.get_docs_count(index_name, index_type)
     self.assertGreater(n_docs, 3000000)
예제 #22
0
 def test_get_rdm_feature_ids(self):
     ''' Test that requesting two random feature ids returns two ids. '''
     idx = IDX['GFF_GENERIC']['indexName']
     idx_type = IDX['GFF_GENERIC']['indexType']
     ids = ElasticUtils.get_rdm_feature_ids(idx, idx_type, size=2)
     # The message previously said "one document" although two ids are requested.
     self.assertEqual(len(ids), 2, 'Retrieved two feature ids')
예제 #23
0
 def test_bands(self):
     ''' The BAND index type must hold more than 1200 docs. '''
     index_name, index_type = ElasticSettings.idx('BAND', idx_type='BAND').split('/')
     band_count = ElasticUtils.get_docs_count(index_name, index_type)
     self.assertGreater(band_count, 1200)