Пример #1
0
    def test_bulk(self):
        ''' Test Bulk.load() with three payloads: one without a trailing line return
        (expected to be rejected), the same payload with the line return (expected to add
        one document) and one with actions on a document id that does not exist (expected
        to report errors). '''
        self.set_up()
        idx = IDX['MARKER']['indexName']
        elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
        hits_total1 = elastic.get_count()['count']

        json_data = '{"index": {"_index": "%s", "_type": "%s"}}\n' % \
                    (idx, 'marker')
        json_data += json.dumps({"alt": "G", "start": 946, "seqid": "1", "filter": ".",
                                 "ref": "A", "id": "rsXXXXX", "qual": ".", "info": "RS=XXXXX"})
        # no trailing line return yet, so the request must not succeed
        resp = Bulk.load(idx, '', json_data)
        self.assertNotEqual(resp.status_code, 200)

        # note: needs a trailing line return to work
        Bulk.load(idx, '', json_data + '\n')
        Search.index_refresh(idx)
        hits_total2 = elastic.get_count()['count']
        self.assertEqual(hits_total2, hits_total1+1, "contains documents")

        # produce errors updating doc id that doesn't exist
        # NOTE(review): json_data itself still has no trailing line return at this point,
        # so the "delete" action is appended to the same line as the document source -
        # confirm this is intended before changing it.
        json_data += '{"delete": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
                     (idx, 'marker', 'XYZ')
        json_data += '{"update": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
                     (idx, 'marker', 'XYZ')
        json_data += '{"doc": {"start": 100, "end": 200}}\n'
        resp = Bulk.load(idx, '', json_data)
        # parse the response body once instead of once per check
        resp_json = resp.json()
        self.assertTrue('errors' in resp_json and resp_json['errors'])
    def test_pubs_disease_tags(self):
        ''' For each disease compare the number of publications in the index tagged with
        that disease (tags.disease) against the number of PMIDs returned by
        self._get_pmids(). A tab separated report, including any extra or missing PMIDs,
        is printed and the test fails if any of the counts differ. '''
        counts_match = True
        report = ''
        for disease in DiseasePublicationTest.DISEASES:
            pmids = self._get_pmids(disease)
            disease_code = disease.lower()
            tag_filter = Filter(Query.term('tags.disease', disease_code))
            tag_query = ElasticQuery(BoolQuery(b_filter=tag_filter), sources=['pmid'])
            elastic = Search(search_query=tag_query, idx=ElasticSettings.idx('PUBLICATION'),
                             size=len(pmids)*2)
            idx_count = elastic.get_count()['count']

            # one report line per disease, fields joined by tabs
            fields = [disease_code, 'INDEX: '+str(idx_count), 'NCBI: '+str(len(pmids))]
            if idx_count != len(pmids):
                counts_match = False
                pmids_in_idx = [getattr(doc, 'pmid') for doc in elastic.search().docs]
                extra = [pmid for pmid in pmids_in_idx if pmid not in pmids]
                missing = [pmid for pmid in pmids if pmid not in pmids_in_idx]
                if extra:
                    fields.append('extra PMIDs: '+str(extra))
                if missing:
                    fields.append('missing PMIDs: '+str(missing))
            report += '\t'.join(fields) + '\n'

        print(report)
        self.assertTrue(counts_match, 'Count for disease tags')
    def test_marker_pipeline(self):
        """ Run the pipeline load step for the DBSNP and RSMERGEARCH sections and check
        that the index configured for each section contains documents afterwards. """

        def load_section(section):
            # run only the "load" step of the pipeline for one ini section
            call_command("pipeline", "--steps", "load", sections=section, dir=TEST_DATA_DIR, ini=MY_INI_FILE)

        def assert_has_docs(section_config):
            # refresh the section's index, then count the documents of its type
            idx_name = section_config["index"]
            searcher = Search(idx=idx_name, idx_type=section_config["index_type"])
            Search.index_refresh(idx_name)
            self.assertGreater(searcher.get_count()["count"], 0)

        load_section("DBSNP")
        ini_config = IniParser().read_ini(MY_INI_FILE)
        assert_has_docs(ini_config["DBSNP"])

        load_section("RSMERGEARCH")
        assert_has_docs(ini_config["RSMERGEARCH"])
Пример #4
0
    def test_delete_docs_by_query(self):
        ''' Test deleting docs using a query: first a single document selected by a term
        query on its id, then everything that remains of the 'marker' type. '''
        self.set_up()
        idx = IDX['MARKER']['indexName']
        elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
        hits_total1 = elastic.get_count()['count']
        self.assertGreater(hits_total1, 0, "contains documents")

        # delete single doc
        Delete.docs_by_query(idx, query=Query.term("id", "rs2476601"))
        # refresh so the deletion is visible to the count below
        Search.index_refresh(idx)
        hits_total2 = elastic.get_count()['count']
        self.assertEqual(hits_total2, hits_total1-1, "contains documents")

        # delete remaining docs
        Delete.docs_by_query(idx, 'marker')
        Search.index_refresh(idx)
        self.assertEqual(elastic.get_count()['count'], 0, "contains no documents")
        def get_pmids(resp_json):
            ''' Collect the distinct PMIDs from the hits in resp_json and assert that the
            publication index holds a document for each of them. '''
            hit_pmids = [getattr(Document(hit), "pmid") for hit in resp_json['hits']['hits']]
            pmids = list(set(hit_pmids))

            ids_query = ElasticQuery(BoolQuery(b_filter=Filter(Query.ids(pmids))),
                                     sources=['pmid'])
            elastic = Search(search_query=ids_query, idx=ElasticSettings.idx('PUBLICATION'),
                             size=len(pmids)*2)

            if elastic.get_count()['count'] != len(pmids):
                # counts disagree: work out which PMIDs are not in the publication index
                indexed = set([getattr(doc, 'pmid') for doc in elastic.search().docs])
                pmids_diff = list(set(pmids) - indexed)
                self.assertListEqual([], pmids_diff, "PMIDs list empty ("+str(pmids_diff)+")")

            self.assertEqual(len(pmids), elastic.get_count()['count'], 'Count for region publications')
    def test_gene_history_loader(self):
        """ Run the pipeline load step for GENE_HISTORY, then check that the configured
        index holds more than one document and compare its mapping with the one built by
        Gene.gene_history_mapping(). """
        call_command("pipeline", "--steps", "load", sections="GENE_HISTORY", dir=TEST_DATA_DIR, ini=MY_INI_FILE)

        section = IniParser().read_ini(MY_INI_FILE)["GENE_HISTORY"]
        idx = section["index"]
        idx_type = section["index_type"]
        elastic = Search(idx=idx, idx_type=idx_type)
        Search.index_refresh(idx)

        doc_count = elastic.get_count()["count"]
        self.assertTrue(doc_count > 1, "Count documents in the index")

        defined_props = Gene.gene_history_mapping(idx, idx_type, test_mode=True).mapping_properties
        index_mapping = elastic.get_mapping()
        if idx not in index_mapping:
            # log what came back; the lookup below will then fail on the missing index
            logger.error("MAPPING ERROR: " + json.dumps(index_mapping))
        self._cmpMappings(index_mapping[idx]["mappings"], defined_props, idx_type)
    def test_gene_pubs(self):
        ''' Check the difference between the pubs indexed and those from the gene_pub file
        from the NCBI. If the publication pipeline has not been run recently there is likely
        to be a difference. This is allowed for with the NUM_DIFF variable. If there is a
        larger difference than this then the publication pipeline should be run.

        The file is downloaded into the test data directory and removed again once it has
        been parsed, even when the download assertion or the parsing fails. '''
        ini = IniParser()
        config = ini.read_ini('publications.ini')
        section = config['GENE']

        file_name = 'gene_pub_test.tmp'
        download_file = os.path.join(DiseasePublicationTest.TEST_DATA_DIR, file_name)
        try:
            success = FTPDownload().download(urljoin(section['location'], section['files']),
                                             DiseasePublicationTest.TEST_DATA_DIR, file_name=file_name)
            self.assertTrue(success, 'downloaded gene publications file')

            pmids = set()
            with gzip.open(download_file, 'rt') as gene_pub_file:
                for line in gene_pub_file:
                    # only rows whose first column is 9606 are used
                    # (presumably the NCBI taxonomy id for human - confirm)
                    if not line.startswith('9606\t'):
                        continue
                    # the third tab separated column holds the PMID
                    pmids.add(line.split('\t')[2].strip())
        finally:
            # never leave the temporary download behind; it may not exist if the
            # download itself failed
            if os.path.exists(download_file):
                os.remove(download_file)

        pmids = list(pmids)
        elastic = Search(search_query=ElasticQuery(BoolQuery(b_filter=Filter(Query.ids(pmids))),
                                                   sources=['pmid']),
                         idx=ElasticSettings.idx('PUBLICATION'), size=len(pmids)*2)
        # only the size of the gap is checked here; no per-PMID diff is performed
        self.assertLess(len(pmids)-elastic.get_count()['count'], GenePublicationTest.NUM_DIFF,
                        'Count for gene publications')
    def test_pub_disease_counts(self):
        ''' For each disease check that every PMID returned by self._get_pmids() has a
        document in the publication index: first by comparing counts, then by scrolling
        through the matching documents and looking for PMIDs that were not returned. '''
        for disease in DiseasePublicationTest.DISEASES:
            pmids = self._get_pmids(disease)
            disease_code = disease.lower()
            count_query = ElasticQuery(BoolQuery(b_filter=Filter(Query.ids(pmids))))
            elastic = Search(search_query=count_query, idx=ElasticSettings.idx('PUBLICATION'),
                             size=len(pmids)*2)
            self.assertEqual(elastic.get_count()['count'], len(pmids), 'Count for '+disease_code)

            # check for differences in pmids
            indexed_pmids = []

            def collect_pmids(resp_json):
                # callback handed to scan_and_scroll: gather the PMID of each hit
                for hit in resp_json['hits']['hits']:
                    indexed_pmids.append(getattr(Document(hit), "pmid"))

            scroll_query = ElasticQuery(BoolQuery(b_filter=Filter(Query.ids(pmids))),
                                        sources=['pmid'])
            ScanAndScroll.scan_and_scroll(idx=ElasticSettings.idx('PUBLICATION'),
                                          call_fun=collect_pmids, query=scroll_query)
            not_indexed = set(pmids).difference(indexed_pmids)
            self.assertEqual(len(not_indexed), 0)
Пример #9
0
 def get_docs_count(cls, idx, idx_type):
     ''' Return the document count reported for the given index and index type. '''
     return Search(idx=idx, idx_type=idx_type).get_count()['count']
    def test_gene_pipeline(self):
        """ Test the gene pipeline by running its sections in sequence against the
        ENSEMBL_GENE_GTF index and checking the gene documents after each one. """
        gtf_section = IniParser().read_ini(MY_INI_FILE)["ENSEMBL_GENE_GTF"]
        idx = gtf_section["index"]
        idx_type = gtf_section["index_type"]

        def run_pipeline(section, *steps):
            # run the given pipeline steps for one section, then refresh the gene index
            call_command("pipeline", "--steps", *steps, sections=section, dir=TEST_DATA_DIR, ini=MY_INI_FILE)
            Search.index_refresh(idx)

        def docs_for_symbol(symbol):
            # search the gene index for documents matching the symbol
            symbol_query = ElasticQuery.query_string(symbol, fields=["symbol"])
            return Search(symbol_query, idx=idx).search().docs

        # 1. Test ensembl GTF loading.
        run_pipeline("ENSEMBL_GENE_GTF", "stage", "load")
        elastic = Search(idx=idx, idx_type=idx_type)
        self.assertGreaterEqual(elastic.get_count()["count"], 1, "Count documents in the index")
        defined_props = Gene.gene_mapping(idx, idx_type, test_mode=True).mapping_properties
        index_mapping = elastic.get_mapping()
        if idx not in index_mapping:
            logger.error("MAPPING ERROR: " + json.dumps(index_mapping))
        self._cmpMappings(index_mapping[idx]["mappings"], defined_props, idx_type)

        # 2. Test adding entrez ID to documents
        run_pipeline("GENE2ENSEMBL", "load")
        docs = docs_for_symbol("PTPN22")
        self.assertEqual(len(docs), 1)
        dbxrefs = getattr(docs[0], "dbxrefs")
        self.assertTrue("entrez" in dbxrefs)
        self.assertEqual(dbxrefs["entrez"], "26191")

        # 3. Add uniprot and fill in missing entrez fields.
        run_pipeline("ENSMART_GENE", "download", "load")
        dbxrefs = getattr(docs_for_symbol("DNMT3L")[0], "dbxrefs")
        self.assertTrue("entrez" in dbxrefs)
        self.assertTrue("swissprot" in dbxrefs)

        # 4. Add gene synonyms and dbxrefs.
        run_pipeline("GENE_INFO", "load")
        gene_doc = docs_for_symbol("PTPN22")[0]
        self.assertTrue("PTPN8" in getattr(gene_doc, "synonyms"))

        # 5. Add PMIDs to gene docs.
        run_pipeline("GENE_PUBS", "load")
        gene_doc = docs_for_symbol("PTPN22")[0]
        self.assertGreater(len(getattr(gene_doc, "pmids")), 0)

        # 6. Add ortholog data.
        run_pipeline("ENSMART_HOMOLOG", "load")
        dbxrefs = getattr(docs_for_symbol("PTPN22")[0], "dbxrefs")
        self.assertTrue("orthologs" in dbxrefs, dbxrefs)
        self.assertTrue("mmusculus" in dbxrefs["orthologs"], dbxrefs)
        self.assertEqual("ENSMUSG00000027843", dbxrefs["orthologs"]["mmusculus"]["ensembl"])

        # the same gene must also be found through its mouse ortholog ensembl id
        ortholog_query = ElasticQuery.filtered(
            Query.match_all(),
            TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl", ["ENSMUSG00000027843"]),
        )
        docs = Search(ortholog_query, idx=idx, size=1).search().docs
        self.assertEqual(len(docs), 1)

        # 7. Add mouse ortholog link to MGI
        run_pipeline("ENSEMBL2MGI", "load")
        docs = Search(ortholog_query, idx=idx, size=1).search().docs
        mouse_xrefs = getattr(docs[0], "dbxrefs")["orthologs"]["mmusculus"]
        self.assertEqual("ENSMUSG00000027843", mouse_xrefs["ensembl"])
        self.assertEqual("107170", mouse_xrefs["MGI"])
 def test_count_with_query(self):
     ''' Check that a term query on one id counts exactly one document in the
     DEFAULT index. '''
     id_query = ElasticQuery(Query.term("id", "rs768019142"))
     doc_count = Search(id_query, idx=ElasticSettings.idx('DEFAULT')).get_count()['count']
     self.assertTrue(doc_count == 1, "Elastic count with a query")
 def test_count(self):
     ''' Check that the DEFAULT index reports more than one document. '''
     doc_count = Search(idx=ElasticSettings.idx('DEFAULT')).get_count()['count']
     self.assertTrue(doc_count > 1, "Elastic count documents in an index")