def test_sum_taxids(self):
    """Check the set of TaxIDs returned by `sum_taxids`.

    The fixture gives two samples their own input TaxIDs ('1' and '3')
    plus hit tables carrying TaxIDs '4', '6', '8' and '1'; the expected
    result is the de-duplicated union of all of them.
    """

    def make_hits(mapping):
        # one-column hit table: index = mapping keys, 'taxid' = values
        series = pd.Series(mapping, name='taxid')
        return series.to_frame()

    # hit tables grouped by sample
    s1_tables = [
        make_hits({'a': '4', 'b': '6'}),
        make_hits({'a': '4', 'c': '8'}),
    ]
    s2_tables = [
        make_hits({'b': '6', 'd': '1'}),
    ]

    analyzer = Analyze()
    analyzer.input_tax = {'S1': '1', 'S2': '3'}
    # each sample maps to a list of records, each holding a 'hits' table
    analyzer.data = {
        'S1': [{'hits': table} for table in s1_tables],
        'S2': [{'hits': table} for table in s2_tables],
    }

    observed = analyzer.sum_taxids()
    expected = {'1', '3', '4', '6', '8'}
    self.assertSetEqual(observed, expected)
def test_assign_taxonomy(self):
    """Exercise `assign_taxonomy` with explicit, inferred and invalid input.

    Scenarios, in order: two genomes with user-defined TaxIDs, a single
    genome with a bare TaxID, a TaxID missing from the database, TaxIDs
    inferred from search hits, a sample whose taxonomy cannot be
    inferred, and a malformed input taxonomy string.
    """

    def make_hits(mapping):
        # helper for making hit table: one 'taxid' column, object dtype
        series = pd.Series(mapping, name='taxid', dtype=object)
        return series.to_frame()

    def assert_value_error(analyzer, message):
        # run assign_taxonomy and require a ValueError with this exact text
        with self.assertRaises(ValueError) as caught:
            analyzer.assign_taxonomy()
        self.assertEqual(str(caught.exception), message)

    # --- two genomes with defined taxonomy ---
    ana = Analyze()
    ana.input_tax = 'S1:561,S2:620'  # Escherichia and Shigella
    ana.data = {}
    ana.taxdump = taxdump_from_text(taxdump_proteo)
    ana.assign_taxonomy()

    # input taxonomy string is parsed into a sample-to-TaxID dict
    self.assertDictEqual(ana.input_tax, {'S1': '561', 'S2': '620'})

    # taxonomy database is refined down to these TaxIDs
    kept_taxids = {
        '1', '131567', '2', '1224', '1236', '91347', '543', '561', '620'
    }
    self.assertSetEqual(set(ana.taxdump.keys()), kept_taxids)

    # LCA discovery
    self.assertEqual(ana.lca, '543')

    # --- one genome with defined taxonomy ---
    ana = Analyze()
    empty_hits = pd.DataFrame(columns=['taxid'])
    ana.data = {'S1': [{'hits': empty_hits}]}
    ana.input_tax = '561'  # Escherichia
    ana.taxdump = taxdump_from_text(taxdump_proteo)
    ana.assign_taxonomy()
    self.assertDictEqual(ana.input_tax, {'S1': '561'})

    # --- input taxonomy not found in database ---
    ana.input_tax = '1234'
    ana.taxdump = taxdump_from_text(taxdump_proteo)
    assert_value_error(
        ana, 'TaxID 1234 is not present in taxonomy database.')

    # --- two genomes whose taxonomies are to be inferred from hits ---
    ana = Analyze()
    ana.input_tax = None
    ana.data = {
        'S1': [
            {'hits': make_hits({'P1': '561', 'P2': '562'})},
            {'hits': make_hits({'P3': '543', 'P4': '561'})},
        ],
        'S2': [
            {'hits': make_hits({'P5': '562', 'P6': '585056'})},
            {'hits': make_hits({'P7': '561', 'P8': '1038927'})},
            {'hits': make_hits({'P9': '2580236'})},
        ],
    }
    ana.input_cov = 75
    ana.taxdump = taxdump_from_text(taxdump_proteo)
    ana.assign_taxonomy()
    self.assertDictEqual(ana.input_tax, {'S1': '543', 'S2': '561'})
    self.assertEqual(ana.lca, '543')

    # --- cannot auto-infer taxonomy (third sample has an empty table) ---
    ana.data['S3'] = [{'hits': make_hits({})}]
    ana.taxdump = taxdump_from_text(taxdump_proteo)
    assert_value_error(
        ana, 'Cannot auto-infer taxonomy for S3. Please specify manually.')

    # --- invalid input taxonomy string ---
    # (taxdump is intentionally left as-is from the previous scenario)
    ana.input_tax = '561'
    assert_value_error(ana, 'Invalid input taxonomy format.')