def test_retrieve_hierarchy(self):
    """
    retrieve_hierarchy() on the fixture in self.file must return the
    expected lineage entries, in the order listed below.
    """
    # (name, rank, tax_id) for each expected lineage entry.
    lineage = (
        ('cellular organisms', 'no rank', '131567'),
        ('Bacteria', 'superkingdom', '2'),
        ('Terrabacteria group', 'clade', '1783272'),
        ('Firmicutes', 'phylum', '1239'),
        ('Bacilli', 'class', '91061'),
        ('Lactobacillales', 'order', '186826'),
    )
    expected_list = [
        {'name': name, 'rank': rank, 'tax_id': tax_id}
        for name, rank, tax_id in lineage
    ]

    tested_list = NCBITaxonomyScrapper(self.file).retrieve_hierarchy()

    self.assertListEqual(tested_list, expected_list)
def test_retrieve_current_item(self):
    """
    retrieve_current_item() on the fixture in self.file must return the
    rank, tax_id and name of the family entry below.
    """
    expected_dict = dict(rank='family', tax_id='33958', name='Lactobacillaceae')

    tested_dict = NCBITaxonomyScrapper(self.file).retrieve_current_item()

    self.assertDictEqual(tested_dict, expected_dict)
def get(self, tax_id: int, get_model: bool = True) -> Union[NCBITaxonomyScrapper.model, dict]:
    """
    Request the NCBI Taxonomy Browser info page for ``tax_id`` and return
    the entry parsed from it.

    :param tax_id: NCBI taxonomy ID to retrieve data from
    :param get_model: return pydantic model (return dict if False)
    :return: the scrapper's ``validated_entry`` when ``get_model`` is true,
        otherwise the result of calling ``.dict()`` on it
    :raises requests.exceptions.HTTPError: from ``raise_for_status()`` on
        the response, or when the scrapper reports that no result was found
    """
    url = f"https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id={tax_id}&mode=info"
    response = self.session.get(url)
    # Recorded once the request call has returned, but before the status
    # check, so it is set even when raise_for_status() raises below.
    self.last_url_requested = url
    response.raise_for_status()

    page = NCBITaxonomyScrapper(response.content)
    if not page.result_found():
        raise requests.exceptions.HTTPError(f"{tax_id} not found in NCBI taxonomy db.")

    entry = page.validated_entry
    return entry if get_model else entry.dict()
def test_retrieve_current_item_variant_1(self):
    """
    Some entries have a different display and more information,
    for instance tax_id 12345.
    """
    file_path = os.path.join(os.path.dirname(__file__), 'data/tax_12345.html')
    expected_dict = {
        'rank': 'species',
        'tax_id': '12345',
        'name': 'Bacillus virus GA1'
    }
    # Open the fixture in a context manager so the handle is always closed
    # (the previous bare open() leaked it). All scrapper calls stay inside
    # the block so the file is still open if the scrapper reads it late.
    with open(file_path, "rb") as tax_file:
        scrapper = NCBITaxonomyScrapper(tax_file)
        tested_dict = scrapper.retrieve_current_item()
    self.assertDictEqual(tested_dict, expected_dict)
def test_retrieve_current_item_no_link(self):
    """
    Some entries have their name without a link, for instance tax_id 339588.
    """
    file_path = os.path.join(os.path.dirname(__file__), 'data/tax_339588.html')
    expected_dict = {
        'rank': 'species',
        'tax_id': '339588',
        'name': 'Peyssonnelia inamoena'
    }
    # Open the fixture in a context manager so the handle is always closed
    # (the previous bare open() leaked it). All scrapper calls stay inside
    # the block so the file is still open if the scrapper reads it late.
    with open(file_path, "rb") as tax_file:
        scrapper = NCBITaxonomyScrapper(tax_file)
        tested_dict = scrapper.retrieve_current_item()
    self.assertDictEqual(tested_dict, expected_dict)
def test_extract_tax_id_from_url(self):
    """
    extract_tax_id_from_url() must return the value of the ``id`` query
    parameter as a string for the sample browser URL below.
    """
    tested_url = "wwwtax.cgi?mode=Undef&id=131567&lvl=3&keep=1&srchmode=1&unlock"
    self.assertEqual(
        NCBITaxonomyScrapper.extract_tax_id_from_url(tested_url),
        "131567",
    )
def test_result_found_error_page(self):
    """
    result_found() must be falsy when the scrapper is given the
    error-page fixture.
    """
    file_path = os.path.join(os.path.dirname(__file__), 'data/error_page.html')
    # Open the fixture in a context manager so the handle is always closed
    # (the previous bare open() leaked it). The scrapper is used inside the
    # block so the file is still open if the scrapper reads it late.
    with open(file_path, "rb") as tax_file:
        scrapper = NCBITaxonomyScrapper(tax_file)
        found = scrapper.result_found()
    self.assertFalse(found)
def test_result_found(self):
    """result_found() must be truthy for the fixture in self.file."""
    found = NCBITaxonomyScrapper(self.file).result_found()
    self.assertTrue(found)