def test_Gene_format_list_of_ids(self):
    # A list of Gene ids is expected to collapse into ONE efetch query whose
    # `id` parameter is the comma-joined list.
    id_ = ['7347', '50933']
    expected = [(
        'Gene',
        f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=Gene&id={",".join(id_)}&retmode=xml&email={self.email}&tool=ccd.rhpc.nki.nl',
    )]

    actual = dba.UrlFormatter().format('Gene', id_)

    self.assertEqual(actual, expected)
def test_EMBL_CCDS_RefSeq(self):
    # End-to-end check of the pipeline
    #   UrlFormatter.format -> Entry_fetcher.fetch_all -> EntrySplitter.split
    #   -> DnaParser.parse
    # for a mix of EMBL, RefSeq and CCDS identifiers. Every parsed item must
    # be one of the expected CodingSequence objects and the counts must match.
    # NOTE(review): fetch_all presumably performs live HTTP requests, so this
    # test needs network access -- confirm against dba.Entry_fetcher.
    exp = [
        CodingSequence(
            'CR456855', 'EMBL',
            Seq(
                'ATGGAGGGTCAACGCTGGCTGCCGCTGGAGGCCAATCCCGAGGTCACCAACCAGTTTCTTAAACAATTAGGTCTACATCCTAACTGGCAATTCGTTGATGTATATGGAATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCTTAA',
                IUPACUnambiguousDNA()),
            Seq(
                'MEGQRWLPLEANPEVTNQFLKQLGLHPNWQFVDVYGMDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA',
                ExtendedIUPACProtein())),
        CodingSequence(
            'DQ917642', 'EMBL',
            Seq(
                'ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGAGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGCTGATTTTTCTTTTCAAGTGGCAGCCCGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCCAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTAAGTGTGTTATTGAACTGTACCCATCAGGATGTCCATTTAGGAGAGACATTGTCAGAGTTTAAGGAATTCTCACAAAGTTTTGATGCAGCTATGAAAGGTTTGGCCCTGAGTAATTCGGATGTGATTCGCCAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATGCAAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTACGTTCCTGTGAATGGAAGACTGTACGAATTAGATGGATTAAGAGAAGGACCGATCGATTTAGGTGCATGCAATCAAGATGACTGGATCAGCGCAGTGAGGCCAGTCATAGAAAAAAGGATACAAAAGTACAGTGAAGGTGAAATTCGATTTAACTTAATGGCCATTGTGTCTGACAGGAAAATGATATATGAACAGAAGATAGCAGAGTTACAAAGACAGCTTGCTGAGGAGGAACCCATGGATACAGATCAGGGTAGTAACATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATATAAGATTGAAAACATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAACTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCTCTCGTAGAAAAGGCAAAAGAAAAACAGAATGCGAAGAAGGCACAGGAAACCAAATGA',
                IUPACUnambiguousDNA()),
            Seq(
                'MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDAKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEEPMDTDQGSNMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK',
                ExtendedIUPACProtein())),
        CodingSequence(
            'NM_001270952', 'RefSeq',
            Seq(
                'ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG',
                IUPACUnambiguousDNA()),
            Seq(
                'MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA',
                ExtendedIUPACProtein())),
        # The two CCDS entries are built without an explicit alphabet,
        # exactly as in the original expectation.
        CodingSequence(
            'CCDS73586.1', 'CCDS',
            Seq('ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG'
                ),
            Seq('MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA'
                )),
        CodingSequence(
            'CCDS86041.1', 'CCDS',
            Seq('ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGGGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGTTAATTTTTCTTTTCAAGTGGCAGCCAGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCTAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTGAGTGTGTTACTGAACTGTACCCACCAGGATGTCCATTTAGGCGAGACATTATCAGAGTTTAAAGAATTTTCACAAAGTTTTGATGCAGCTATGAAAGGCTTGGCACTGAGCAATTCAGATGTGATTCGACAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATACGAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTATGTTCCTGTTAATGGGAGACTGTATGAATTAGATGGATTAAGAGAAGGACCGATTGATTTAGGTGCATGCAATCAAGATGATTGGATCAGTGCAGTAAGGCCTGTCATAGAAAAAAGGATACAAAAAGACGGGTTTTCACCATGTTGCCCAGGCTGGTCTCAGACTCCTGAGCTCAAGCCATCCGCCTGCCTCGACCTCCCAAAGTGGTACAGTGAAGGTGAAATTCGATTTAATTTAATGGCCATTGTGTCTGACAGAAAAATGATATATGAGCAGAAGATAGCAGAGTTACAAAGACAACTTGCAGAGGAACCCATGGATACAGATCAAGGTAATAGTATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATACAAGATTGAGAATATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAATTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCACTAGTAGAAAAGGCAAAAGAAAAACAGAACGCAAAGAAAGCTCAGGAAACCAAATGA'
                ),
            Seq('MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDTKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKDGFSPCCPGWSQTPELKPSACLDLPKWYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEPMDTDQGNSMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK'
                )),
    ]
    ids = {
        'EMBL': ['CR456855.1', 'DQ917642.1'],
        'RefSeq': ['NM_001270952.1'],
        'CCDS': ['CCDS73586.1', 'CCDS86041.1'],
    }

    # One or more (database, url) queries per database, concatenated.
    formatter = dba.UrlFormatter()
    queries = []
    for database, id_list in ids.items():
        queries += formatter.format(database, id_list)

    # The loop returned by get_event_loop() is shared with every other test
    # in this process that fetches data the same way. It is deliberately NOT
    # closed here: a closed loop cannot be reopened, and later
    # run_until_complete() calls on it raise "Event loop is closed".
    loop = asyncio.get_event_loop()
    fetcher = dba.Entry_fetcher()
    entries = loop.run_until_complete(fetcher.fetch_all(queries))

    splitter = dba.EntrySplitter()
    entries = splitter.split(entries)

    parser = dba.DnaParser()
    res = parser.parse(entries)

    # assertIn reports the offending item on failure, unlike
    # assertTrue(item in exp) which only says "False is not true".
    for item in res:
        self.assertIn(item, exp)
    # Together with the membership check above, this rules out missing
    # entries (assuming the parser yields no duplicates).
    self.assertEqual(len(exp), len(res))
def test_EMBL_format_list_of_ids(self):
    # Several EMBL ids should produce a single nuccore efetch query with the
    # ids comma-joined in the `id` parameter.
    id_ = ['CR456855.1', 'GBYX01232236.1']
    expected = [(
        'EMBL',
        f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={",".join(id_)}&retmode=xml&rettype=gb&email={self.email}&tool=ccd.rhpc.nki.nl',
    )]

    actual = dba.UrlFormatter().format('EMBL', id_)

    self.assertEqual(actual, expected)
def test_uniprot_no_format(self):
    # Uniprot queries require an explicit, supported `format_`; both a
    # missing and an unknown format must be rejected with ValueError.
    url_formatter = dba.UrlFormatter()

    # No format supplied at all.
    self.assertRaises(ValueError, url_formatter.format, 'Uniprot', 'q9uj41')

    # A format that is not among the supported ones.
    self.assertRaises(
        ValueError,
        url_formatter.format,
        'Uniprot',
        'q9uj41',
        format_='notvalid',
    )
def test_RefSeq_format_list_of_ids(self):
    # Several RefSeq ids should produce a single nuccore efetch query with
    # the ids comma-joined in the `id` parameter.
    id_ = ['NM_001270952.1', 'NM_006002.4']
    expected = [(
        'RefSeq',
        f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={",".join(id_)}&retmode=xml&rettype=gb&email={self.email}&tool=ccd.rhpc.nki.nl',
    )]

    actual = dba.UrlFormatter().format('RefSeq', id_)

    self.assertEqual(actual, expected)
def test_uniprot_all_formats(self):
    # For a single accession, each supported format should map to
    # https://www.uniprot.org/uniprot/<id>.<format>.
    id_ = 'q9uj41'
    url_formatter = dba.UrlFormatter()

    for format_ in ('xml', 'fasta', 'html'):
        expected = [('Uniprot', f'https://www.uniprot.org/uniprot/{id_}.{format_}')]
        actual = url_formatter.format('Uniprot', id_, format_=format_)
        self.assertEqual(actual, expected)
def test_RefSeq_single_id(self):
    # A bare string id (not wrapped in a list) must yield one nuccore
    # efetch query tagged 'RefSeq'.
    id_ = 'NM_001270952.1'
    expected_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={id_}&retmode=xml&rettype=gb&email={self.email}&tool=ccd.rhpc.nki.nl'
    expected = [('RefSeq', expected_url)]

    url_formatter = dba.UrlFormatter()
    actual = url_formatter.format('RefSeq', id_)

    self.assertEqual(actual, expected)
def test_Gene_format_single_id(self):
    # A bare string Gene id must yield one efetch query against db=Gene.
    id_ = '7347'
    expected_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=Gene&id={id_}&retmode=xml&email={self.email}&tool=ccd.rhpc.nki.nl'
    expected = [('Gene', expected_url)]

    url_formatter = dba.UrlFormatter()
    actual = url_formatter.format('Gene', id_)

    self.assertEqual(actual, expected)
def test_CCDS_single_id(self):
    # A bare string CCDS id must yield one CcdsBrowse.cgi query.
    id_ = 'CCDS73586.1'
    expected_url = f'https://www.ncbi.nlm.nih.gov/projects/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&DATA={id_}&ORGANISM=0&BUILDS=CURRENTBUILDS'
    expected = [('CCDS', expected_url)]

    url_formatter = dba.UrlFormatter()
    actual = url_formatter.format('CCDS', id_)

    self.assertEqual(actual, expected)
def test_CCDS_format_list_of_ids(self):
    # Unlike the efetch-backed databases, CCDS ids are expected to produce
    # one query per id, in the same order as the input list.
    id_list = ['CCDS73586.1', 'CCDS86041.1']
    expected = [
        (
            'CCDS',
            f'https://www.ncbi.nlm.nih.gov/projects/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&DATA={id_}&ORGANISM=0&BUILDS=CURRENTBUILDS',
        )
        for id_ in id_list
    ]

    actual = dba.UrlFormatter().format('CCDS', id_list)

    self.assertEqual(actual, expected)
def test_correct_functioning(self):
    # Fetch the Gene entry for id 27342, parse it as XML and check that
    # GeneParser.get_crossreferences returns the expected CCDS ids,
    # irrespective of order.
    # NOTE(review): fetch_all presumably performs a live HTTP request, so
    # this test needs network access -- confirm against dba.Entry_fetcher.
    loop = asyncio.get_event_loop()
    parser = dba.GeneParser()
    uf = dba.UrlFormatter()
    fetcher = dba.Entry_fetcher()

    query = uf.format('Gene', '27342')
    res = loop.run_until_complete(fetcher.fetch_all(query))
    # res[0][1] is handed to BeautifulSoup as the XML markup of the first
    # (and only) fetched entry.
    xml_soup = BeautifulSoup(res[0][1], 'xml')

    exp = ['CCDS69308.1', 'CCDS5535.1', 'CCDS75610.1']
    # list.sort() sorts in place and returns None, so comparing the results
    # of two .sort() calls is always `None == None` and can never fail.
    # sorted() returns the ordered lists, making the comparison meaningful.
    self.assertEqual(sorted(exp), sorted(parser.get_crossreferences(xml_soup)))
def test_uniprot_batch(self):
    # Requesting format_='batch_<fmt>' with a list of accessions should give
    # one uploadlists query containing all ids and `format=<fmt>`.
    url_formatter = dba.UrlFormatter()
    id_list = ['q9uj41', 'Q15287', 'Q96RL1']

    # The 'batch_' prefix is prepended when calling the formatter below.
    for f in ('xml', 'html', 'fasta'):
        expected_url = f'https://www.uniprot.org/uploadlists?query=q9uj41 Q15287 Q96RL1&from=ACC+ID&to=ACC&format={f}'
        expected = [('Uniprot', expected_url)]
        actual = url_formatter.format('Uniprot', id_list, format_=f'batch_{f}')
        self.assertEqual(actual, expected)
def test_format_for_summary(self):
    # format_for_summary should build esummary URLs tagged 'Summary', both
    # for a list of ids (comma-joined) and for a single id string.
    list_query = ('EMBL', ['CR456855.1', 'DQ917642.1'])
    single_query = ('RefSeq', 'NM_001270952.1')
    expected = [
        (
            'Summary',
            'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=nuccore&id=CR456855.1,DQ917642.1',
        ),
        (
            'Summary',
            'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=nuccore&id=NM_001270952.1',
        ),
    ]

    url_formatter = dba.UrlFormatter()
    database, ids = list_query
    actual = url_formatter.format_for_summary(database, ids)
    database, ids = single_query
    actual += url_formatter.format_for_summary(database, ids)

    self.assertEqual(expected, actual)
def _fetch_data(self, database, id_list):
    """Format a query for `database`/`id_list` and fetch it synchronously.

    Builds the queries with ``dba.UrlFormatter.format`` and drives
    ``dba.Entry_fetcher.fetch_all`` to completion on the current asyncio
    event loop, returning whatever ``fetch_all`` produces.
    """
    event_loop = asyncio.get_event_loop()
    url_formatter = dba.UrlFormatter()
    entry_fetcher = dba.Entry_fetcher()

    queries = url_formatter.format(database, id_list)
    pending = entry_fetcher.fetch_all(queries)
    return event_loop.run_until_complete(pending)
def test_invalid_database(self):
    # An unknown database name must be rejected with ValueError by both
    # URL-building entry points.
    url_formatter = dba.UrlFormatter()
    bogus_ids = ['id1', 'id2']

    self.assertRaises(
        ValueError, url_formatter.format, 'NotADatabase', bogus_ids)
    self.assertRaises(
        ValueError, url_formatter.format_for_summary, 'NotADatabase', bogus_ids)