def test_genome_lineages(self):
    """Verify the lineage strings written by ``genome_lineages()``.

    Four genomes are registered against the Proteobacteria test taxdump
    with blank higher-rank columns; after the call, ``lineages.txt`` in
    the output directory is parsed as ``genome<TAB>lineage`` rows and
    each lineage is compared with the expected rank-prefixed string.
    """
    db = Database()
    db.output = self.tmpdir
    db.taxdump = taxdump_from_text(taxdump_proteo)

    # rows are (genome, taxid, species)
    records = (
        ('G1', '1224', ''),     # Proteobacteria
        ('G2', '562', '562'),   # Escherichia coli
        ('G3', '622', '622'),   # Shigella dysenteriae
        ('G4', '548', '548'),   # Klebsiella aerogenes
    )
    frame = pd.DataFrame(records, columns=['genome', 'taxid', 'species'])
    db.df = frame.set_index('genome')

    # every rank above species starts out empty
    higher_ranks = ('superkingdom', 'kingdom', 'phylum', 'class', 'order',
                    'family', 'genus')
    for rank in higher_ranks:
        db.df[rank] = ''

    db.genome_lineages()

    outfile = join(self.tmpdir, 'lineages.txt')
    with open(outfile, 'r') as fh:
        obs = dict(row.split('\t') for row in fh.read().splitlines())

    # shared lineage prefixes
    proteo = 'k__Bacteria; p__Proteobacteria;'
    entero = proteo + ' c__Gammaproteobacteria; o__Enterobacterales;' +\
        ' f__Enterobacteriaceae;'
    expected = (
        ('G1', proteo + ' c__; o__; f__; g__; s__'),
        ('G2', entero + ' g__Escherichia; s__Escherichia coli'),
        ('G3', entero + ' g__Shigella; s__Shigella dysenteriae'),
        ('G4', entero + ' g__Klebsiella; s__Klebsiella aerogenes'),
    )
    for genome, lineage in expected:
        self.assertEqual(obs[genome], lineage)

    remove(outfile)
def test_compile_database(self):
    """Exercise ``compile_database()`` under each ``compile`` setting.

    With ``'none'`` the output directory must stay empty. After staging
    the DnaK sample files, ``'blast'``, ``'diamond'`` and ``'both'`` are
    run in turn; each run must leave the matching database directory
    and files, which are removed again before the next run.
    """
    db = Database()
    db.output = self.tmpdir

    blast_dir = join(self.tmpdir, 'blast')
    diamond_dir = join(self.tmpdir, 'diamond')

    def check_blast():
        # directory plus the six index files asserted for a BLAST database
        self.assertTrue(isdir(blast_dir))
        for ext in ('phr', 'pin', 'pog', 'psd', 'psi', 'psq'):
            self.assertTrue(isfile(join(blast_dir, f'db.{ext}')))

    def check_diamond():
        # directory plus the single DIAMOND database file
        self.assertTrue(isdir(diamond_dir))
        self.assertTrue(isfile(join(diamond_dir, 'db.dmnd')))

    # don't compile
    db.compile = 'none'
    db.compile_database()
    self.assertListEqual(listdir(self.tmpdir), [])

    # get database files
    sample_dir = join(self.datadir, 'DnaK')
    taxdump_dir = join(self.tmpdir, 'taxdump')
    taxon_map = join(self.tmpdir, 'taxon.map')
    copy(join(sample_dir, 'linear.faa'), join(self.tmpdir, 'db.faa'))
    copy(join(sample_dir, 'prot2tid.txt'), taxon_map)
    makedirs(taxdump_dir)
    for fname in ('nodes.dmp', 'names.dmp'):
        copy(join(sample_dir, 'taxdump', fname), join(taxdump_dir, fname))
    with open(taxon_map, 'r') as fh:
        db.taxonmap = dict(
            row.split('\t') for row in fh.read().splitlines())

    # set parameters
    db.threads = 1
    db.tmpdir = self.tmpdir
    db.makeblastdb = 'makeblastdb'
    db.diamond = 'diamond'

    # compile blast database
    db.compile = 'blast'
    db.compile_database()
    check_blast()
    rmtree(blast_dir)

    # compile diamond database
    db.compile = 'diamond'
    db.compile_database()
    check_diamond()
    rmtree(diamond_dir)

    # compile both databases
    db.compile = 'both'
    db.compile_database()
    check_blast()
    check_diamond()
    rmtree(blast_dir)
    rmtree(diamond_dir)

    # clean up
    remove(join(self.tmpdir, 'db.faa'))
    remove(taxon_map)
    rmtree(taxdump_dir)
def test_build_blast_db(self):
    """Build a BLAST database from the DnaK sample and check its files.

    The protein sequences and the protein-to-taxID table are copied into
    the output directory as ``db.faa`` and ``taxon.map`` before
    ``build_blast_db()`` is called.

    NOTE(review): a second method with this exact name is defined right
    after this one. If both belong to the same class, the later
    definition replaces this one and this test never runs -- confirm
    and either rename or delete.
    """
    db = Database()
    db.output = self.tmpdir
    db.makeblastdb = 'makeblastdb'

    # stage input files under the names used in the output directory
    sample_dir = join(self.datadir, 'DnaK')
    fasta = join(self.tmpdir, 'db.faa')
    taxon_map = join(self.tmpdir, 'taxon.map')
    copy(join(sample_dir, 'linear.faa'), fasta)
    copy(join(sample_dir, 'prot2tid.txt'), taxon_map)

    db.build_blast_db()

    blast_dir = join(self.tmpdir, 'blast')
    self.assertTrue(isdir(blast_dir))
    for ext in ('phr', 'pin', 'pog', 'psd', 'psi', 'psq'):
        self.assertTrue(isfile(join(blast_dir, f'db.{ext}')))

    # clean up
    rmtree(blast_dir)
    remove(fasta)
    remove(taxon_map)
def test_build_blast_db(self):
    """Build a BLAST database from the DnaK sample and check its files.

    The protein sequences are copied into the output directory as
    ``db.faa``; the protein-to-taxID table is loaded into ``taxonmap``
    in memory, and ``tmpdir`` points at the test's temporary directory.

    NOTE(review): an earlier method with this exact name is defined
    right before this one. If both belong to the same class, this
    definition is the one that takes effect -- confirm and remove the
    duplicate.
    """
    db = Database()
    db.output = self.tmpdir
    db.makeblastdb = 'makeblastdb'
    db.tmpdir = self.tmpdir

    sample_dir = join(self.datadir, 'DnaK')
    fasta = join(self.tmpdir, 'db.faa')
    copy(join(sample_dir, 'linear.faa'), fasta)

    # protein ID -> taxID, one tab-separated pair per line
    with open(join(sample_dir, 'prot2tid.txt'), 'r') as fh:
        db.taxonmap = dict(
            row.split('\t') for row in fh.read().splitlines())

    db.build_blast_db()

    blast_dir = join(self.tmpdir, 'blast')
    self.assertTrue(isdir(blast_dir))
    for ext in ('phr', 'pin', 'pog', 'psd', 'psi', 'psq'):
        self.assertTrue(isfile(join(blast_dir, f'db.{ext}')))

    # clean up
    rmtree(blast_dir)
    remove(fasta)
def test_genome_metadata(self):
    """Verify the table written by ``genome_metadata()``.

    A single-genome data frame, including an unrelated ``whatever``
    column, is handed to the database. The resulting ``genomes.tsv``
    must have the expected header order (without ``whatever``) and one
    matching tab-separated data row.
    """
    db = Database()
    db.output = self.tmpdir

    ftp_path = ('ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/123/456/'
                'GCF_000123456.1_ASM123v1')
    record = {
        'genome': 'G1',
        'accession': 'GCF_000123456.1',
        'asm_name': 'ASM123v1',
        'bioproject': 'PRJNA123456',
        'biosample': 'SAMN00123456',
        'assembly_level': 'Chromosome',
        'organism_name': 'hypothetical organism',
        'infraspecific_name': '',
        'isolate': '',
        'taxid': '12345',
        'ftp_path': ftp_path,
        'proteins': 100,
        'residues': 12500,
        'whatever': 'nonsense',
    }
    # one-row frame: series -> single column -> transpose
    column = pd.Series(record).to_frame()
    db.df = column.T

    db.genome_metadata()

    outfile = join(self.tmpdir, 'genomes.tsv')
    with open(outfile, 'r') as fh:
        obs = fh.read().splitlines()

    header = ('genome', 'proteins', 'residues', 'assembly_level',
              'accession', 'bioproject', 'biosample', 'asm_name',
              'organism_name', 'infraspecific_name', 'isolate', 'taxid',
              'ftp_path')
    self.assertEqual(obs[0], '\t'.join(header))

    row = ('G1', '100', '12500', 'Chromosome', 'GCF_000123456.1',
           'PRJNA123456', 'SAMN00123456', 'ASM123v1',
           'hypothetical organism', '', '', '12345', ftp_path)
    self.assertEqual(obs[1], '\t'.join(row))

    remove(outfile)
def test_build_taxonmap(self):
    """Verify ``build_taxonmap()`` both in memory and on disk.

    Each protein starts with a set of taxIDs in ``p2tids``; after the
    call, ``taxonmap`` must hold exactly one taxID per protein, and the
    gzipped ``taxon.map.gz`` in the output directory must contain the
    same protein-to-taxID pairs.
    """
    db = Database()
    db.output = self.tmpdir
    db.taxdump = taxdump_from_text(taxdump_proteo)

    # trailing comments name the taxon expected for each protein
    db.p2tids = {
        'P1': {'766'},              # Rickettsiales
        'P2': {'570', '548'},       # Klebsiella
        'P3': {'620', '622'},       # Shigella
        'P4': {'561', '562'},       # Escherichia
        'P5': {'126792', '28211'},  # root
    }

    db.build_taxonmap()

    exp = {'P1': '766', 'P2': '570', 'P3': '620', 'P4': '561', 'P5': '1'}
    self.assertDictEqual(db.taxonmap, exp)

    mapfile = join(self.tmpdir, 'taxon.map.gz')
    with gzip.open(mapfile, 'rt') as fh:
        obs = dict(row.split('\t') for row in fh.read().splitlines())
    self.assertDictEqual(obs, exp)

    remove(mapfile)
def test_build_diamond_db(self):
    """Build a DIAMOND database from the DnaK sample and check its files.

    The protein sequences are copied into the output directory as
    ``db.faa``, the protein-to-taxID table is loaded into ``taxonmap``,
    and ``nodes.dmp`` is staged under ``taxdump/``. After
    ``build_diamond_db()`` runs, the ``diamond`` directory and
    ``db.dmnd`` must exist.
    """
    me = Database()
    me.output = self.tmpdir
    me.diamond = 'diamond'
    me.threads = 1
    me.tmpdir = self.tmpdir

    sample_dir = join(self.datadir, 'DnaK')
    copyfile(join(sample_dir, 'linear.faa'), join(self.tmpdir, 'db.faa'))

    # protein ID -> taxID, one tab-separated pair per line
    with open(join(sample_dir, 'prot2tid.txt'), 'r') as f:
        me.taxonmap = dict(x.split('\t') for x in f.read().splitlines())

    taxdump_dir = join(self.tmpdir, 'taxdump')
    makedirs(taxdump_dir)
    copyfile(join(sample_dir, 'taxdump', 'nodes.dmp'),
             join(taxdump_dir, 'nodes.dmp'))

    me.build_diamond_db()

    diamond_dir = join(self.tmpdir, 'diamond')
    self.assertTrue(isdir(diamond_dir))
    self.assertTrue(isfile(join(diamond_dir, 'db.dmnd')))

    # clean up
    rmtree(diamond_dir)
    remove(join(self.tmpdir, 'db.faa'))
    # Remove the whole staged directory, not just nodes.dmp: an empty
    # 'taxdump' folder left behind would make a later makedirs() on the
    # same path fail if the temporary directory is reused.
    rmtree(taxdump_dir)
def test_build_taxdump(self):
    """Verify the taxIDs present in the ``nodes.dmp`` written by
    ``build_taxdump()``.

    Four genomes are registered by taxID against the Proteobacteria
    test taxdump, with ``tmpdir`` pointing at the DnaK sample taxdump
    directory. The first tab-separated field of every line in the
    output ``taxdump/nodes.dmp`` must form exactly the expected set.
    """
    db = Database()
    db.output = self.tmpdir
    db.tmpdir = join(self.datadir, 'DnaK', 'taxdump')
    db.taxdump = taxdump_from_text(taxdump_proteo)

    records = (
        ('G1', '1224'),      # Proteobacteria
        ('G2', '562'),       # Escherichia coli
        ('G3', '585056'),    # E. coli UMN026
        ('G4', '1038927'),   # E. coli O104:H4
    )
    frame = pd.DataFrame(records, columns=['genome', 'taxid'])
    db.df = frame.set_index('genome')

    db.build_taxdump()

    out_dir = join(self.tmpdir, 'taxdump')
    with open(join(out_dir, 'nodes.dmp'), 'r') as fh:
        # first column of each row is the node's taxID
        obs = {row.split('\t')[0] for row in fh.read().splitlines()}

    exp = {
        '1', '131567', '2', '1224', '1236', '91347', '543', '561', '562',
        '585056', '1038927'
    }
    self.assertSetEqual(obs, exp)

    rmtree(out_dir)