def test_filter_genomes(self):
    """Exercise ``Database.filter_genomes`` in four configurations.

    A five-row assembly table is reloaded from a fresh copy and filtered
    under: default settings, ``complete`` only, an include list of genome
    IDs, and an exclude list of accessions.
    """
    me = Database()
    columns = ('# assembly_accession', 'assembly_level')
    records = (
        ('GCF_000000001.1', 'Chromosome'),
        ('GCF_000000002.1', 'Complete Genome'),
        ('GCF_000000003.2', 'Scaffold'),
        ('GCF_000000004.1', 'Contig'),
        ('GCA_000000004.1', 'Contig'),
    )
    source = pd.DataFrame(records, columns=columns)

    def refilter(**settings):
        """Apply settings to ``me``, reload the table and filter it."""
        for attr, value in settings.items():
            setattr(me, attr, value)
        # every scenario starts from an untouched copy of the fixture
        me.df = source.copy()
        me.filter_genomes()
        return me.df

    # defaults: the GCF_/GCA_ pair numbered 000000004 is expected to
    # collapse into a single row, with the GCF_ record retained
    kept = refilter(complete=False, genoids=None, exclude=False)
    self.assertEqual(kept.shape[0], 4)
    expected_genomes = [
        'G000000001', 'G000000002', 'G000000003', 'G000000004']
    self.assertListEqual(kept['genome'].tolist(), expected_genomes)
    refseq_hits = kept.query('accession == "GCF_000000004.1"')
    self.assertEqual(refseq_hits.shape[0], 1)

    # restrict to complete genomes: only the 'Chromosome' and
    # 'Complete Genome' rows are expected to remain
    kept = refilter(complete=True)
    self.assertListEqual(
        kept['accnov'].tolist(), ['GCF_000000001', 'GCF_000000002'])

    # include list, given as a comma-separated string of genome IDs
    kept = refilter(complete=False, genoids='G000000001,G000000003')
    self.assertListEqual(
        kept['accession'].tolist(),
        ['GCF_000000001.1', 'GCF_000000003.2'])

    # exclude list, given as a list mixing an accession with a version
    # suffix and one without
    kept = refilter(
        genoids=['GCF_000000002.1', 'GCF_000000004'], exclude=True)
    self.assertListEqual(
        kept['accession'].tolist(),
        ['GCF_000000001.1', 'GCF_000000003.2'])
def test_identify_taxonomy(self):
    """Exercise ``Database.identify_taxonomy`` in three configurations.

    A five-row organism table is reloaded from a fresh copy and processed
    under: capital + latinate name rules, a block word, and exclusion by
    taxonomy ID. Surviving rows are checked through the DataFrame index.
    """
    me = Database()
    columns = ('organism_name', 'taxid', 'species', 'species_taxid')
    records = (
        ('Escherichia coli UMN026', '585056', 'E. coli', '562'),
        ('Escherichia coli O104:H4', '1038927', 'E. coli', '562'),
        ('Klebsiella aerogenes', '548', 'Klebsiella aerogenes', '548'),
        ('unclassified Gammaproteobacteria', '118884', '', ''),
        ('Plasmid pPY113', '126792', '', ''),
    )
    source = pd.DataFrame(records, columns=columns)

    def reidentify(**settings):
        """Apply settings to ``me``, reload the table and process it."""
        for attr, value in settings.items():
            setattr(me, attr, value)
        # every scenario starts from an untouched copy of the fixture
        me.df = source.copy()
        me.identify_taxonomy()
        return me.df

    # the taxonomy is parsed once and reused by all three scenarios
    me.taxdump = taxdump_from_text(taxdump_proteo)

    # organism names must be capital and latinate: the first three rows
    # are expected to survive, 'species' is expected to hold the taxIDs
    # and the 'species_taxid' column is expected to be gone
    result = reidentify(
        capital=True, block=None, latin=True, taxids=None, exclude=False)
    self.assertNotIn('species_taxid', result.columns)
    self.assertListEqual(result.index.tolist(), [0, 1, 2])
    self.assertListEqual(
        result['species'].tolist(), ['562', '562', '548'])

    # block word replaces the latinate rule; same rows expected
    result = reidentify(block='plasmid', latin=False)
    self.assertListEqual(result.index.tolist(), [0, 1, 2])

    # no Escherichia: excluding taxID 561 is expected to leave only the
    # Klebsiella row
    result = reidentify(taxids='561', exclude=True)
    self.assertListEqual(result.index.tolist(), [2])