def test_find_match(self):
    """Check the taxon returned by find_match under several thresholds."""
    analyzer = Analyze()
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(analyzer.taxdump)

    # hit table: one [score, taxid] record per hit
    records = [
        [100, '585056'],   # E. coli UMN026
        [99, '1038927'],   # E. coli O104:H4
        [97, '562'],       # Escherichia coli
        [95, '622'],       # Shigella dysenteriae
        [92, '543'],       # Enterobacteriaceae
        [88, '548'],       # Klebsiella aerogenes
        [80, '766'],       # Rickettsiales
    ]
    hits = pd.DataFrame(records, columns=['score', 'taxid'])

    # (match threshold, taxon expected for the hits that pass it)
    scenarios = (
        (0.99, '562'),    # keep top 1% hits
        (0.9, '543'),     # keep top 10% hits
        (0.8, '1224'),    # keep top 20% hits
    )
    for threshold, expected in scenarios:
        analyzer.match_th = threshold
        self.assertEqual(analyzer.find_match(hits), expected)

    # input DataFrame is empty
    no_hits = pd.DataFrame()
    self.assertEqual(analyzer.find_match(no_hits), '0')
def test_add_children(self):
    """Check the 'children' lists that add_children attaches to taxa."""
    from copy import deepcopy

    # Deep-copy the shared fixture: a shallow copy would still share the
    # nested per-taxon records, which add_children presumably annotates in
    # place (it returns nothing that is used here), so changes could leak
    # into other tests that use the same fixture.
    taxdump = deepcopy(taxdump_archaea)
    add_children(taxdump)

    # order of children is not asserted, hence the set comparison
    self.assertSetEqual(set(taxdump['2157']['children']),
                        {'1935183', '1783276'})
    self.assertSetEqual(set(taxdump['1655434']['children']), {'1655637'})

    # this taxon is expected to end up with an empty list of children
    self.assertListEqual(taxdump['2']['children'], [])
def test_infer_close_group(self):
    """Check the close group chosen by infer_close_group in three setups."""
    analyzer = Analyze()
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(analyzer.taxdump)

    # close group is parent of LCA of self group
    ecoli_clade = set(get_descendants('562', analyzer.taxdump)) | {'562'}
    analyzer.groups = {'self': ecoli_clade}
    analyzer.self_tax = ['562']  # E. coli
    analyzer.close_tax = None
    analyzer.close_size = None
    analyzer.infer_close_group()
    self.assertListEqual(analyzer.close_tax, ['561'])  # Escherichia
    self.assertSetEqual(analyzer.groups['close'], {'561', '2580236'})

    # close group must have at least 5 taxa
    analyzer.close_tax = None
    analyzer.groups['close'] = None
    analyzer.close_size = 5
    analyzer.infer_close_group()
    self.assertListEqual(analyzer.close_tax, ['543'])  # Enterobacteriaceae
    self.assertSetEqual(
        analyzer.groups['close'],
        {'543', '620', '622', '570', '548', '561', '2580236'})

    # close group is LCA of multiple self groups
    analyzer.self_tax = ['561', '620']  # Escherichia and Shigella
    self_members = set()
    for tid in analyzer.self_tax:
        self_members.add(tid)
        self_members.update(get_descendants(tid, analyzer.taxdump))
    analyzer.groups['self'] = self_members
    analyzer.close_tax = None
    analyzer.groups['close'] = None
    analyzer.close_size = None
    analyzer.infer_close_group()
    self.assertListEqual(analyzer.close_tax, ['543'])  # Enterobacteriaceae
    self.assertSetEqual(analyzer.groups['close'], {'543', '570', '548'})
def test_infer_self_group(self):
    """Check the self group chosen by infer_self_group for given LCA/rank."""
    analyzer = Analyze()
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(analyzer.taxdump)

    def _inferred_self_tax(lca, rank):
        # reset the inputs read by infer_self_group, run it, return result
        analyzer.self_tax = None
        analyzer.lca = lca
        analyzer.self_rank = rank
        analyzer.infer_self_group()
        return analyzer.self_tax

    # assign to LCA of all genomes (E. coli)
    self.assertListEqual(_inferred_self_tax('562', None), ['562'])

    # raise LCA to genus level (Escherichia)
    self.assertListEqual(_inferred_self_tax('562', 'genus'), ['561'])

    # LCA (Enterobacteriaceae) is already above designated rank (genus)
    self.assertListEqual(_inferred_self_tax('543', 'genus'), ['543'])
def test_define_groups(self):
    """Check define_groups with user-defined and with auto-inferred groups."""
    me = Analyze()
    me.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(me.taxdump)

    # user defined groups:
    # self: genera Escherichia and Shigella
    # close: family Enterobacteriaceae
    me.groups = {}
    me.self_tax = '561,620'
    me.close_tax = '543'
    me.define_groups()
    # the comma-separated parameter is expected to become a list of taxIds
    self.assertListEqual(me.self_tax, ['561', '620'])
    exp = {'561', '562', '585056', '1038927', '2580236', '620', '622'}
    self.assertSetEqual(me.groups['self'], exp)
    self.assertListEqual(me.close_tax, ['543'])
    exp = {'543', '548', '570'}
    self.assertSetEqual(me.groups['close'], exp)

    # auto-infer groups
    me.self_tax = {}
    me.close_tax = {}
    me.lca = '562'          # all inputs are E. coli
    me.self_rank = 'genus'  # but we want to raise self to genus
    me.close_size = 2       # close group must be this big or bigger
    me.define_groups()
    self.assertListEqual(me.self_tax, ['561'])
    exp = {'561', '562', '585056', '1038927', '2580236'}
    self.assertSetEqual(me.groups['self'], exp)
    self.assertListEqual(me.close_tax, ['543'])
    exp = {'543', '548', '570', '620', '622'}
    self.assertSetEqual(me.groups['close'], exp)
def test_add_children(self):
    """Check the 'children' lists that add_children attaches to taxa."""
    tree = taxdump_from_text(taxdump_archaea)
    add_children(tree)

    # taxId -> expected children (compared as sets, order is not asserted)
    expected_children = (
        ('1', {'131567'}),
        ('2157', {'1935183', '1783276'}),
        ('1655434', {'1655637'}),
    )
    for tid, children in expected_children:
        self.assertSetEqual(set(tree[tid]['children']), children)

    # this taxon is expected to end up with an empty list of children
    self.assertListEqual(tree['2']['children'], [])
def test_calc_scores(self):
    """Check groups, per-group scores and match taxon set by calc_scores."""
    columns = ('id', 'taxid', 'score')

    def _hits_df(data):
        # helper for making hit table
        return pd.DataFrame(data, columns=columns).set_index('id')

    def _prot_scores(prot):
        # helper for get scores
        return [prot['self'], prot['close'], prot['distal']]

    analyzer = Analyze()
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(analyzer.taxdump)
    analyzer.groups = {
        'self': {'561', '562', '585056'},
        'close': {'543', '91347', '1236'},
    }

    # two proteins for sample S1, one for sample S2
    s1_first = {
        'score': 100,
        'hits': _hits_df((('P1', '561', 100), ('P2', '562', 95))),
    }
    s1_second = {
        'score': 90,
        'hits': _hits_df((('P3', '561', 81), ('P4', '543', 72))),
    }
    s2_first = {
        'score': 96,
        'hits': _hits_df(
            (('P5', '561', 90), ('P6', '543', 84), ('P7', '620', 66))),
    }
    analyzer.data = {'S1': [s1_first, s1_second], 'S2': [s2_first]}

    analyzer.weighted = True
    analyzer.match_th = 0.9
    analyzer.calc_scores()

    prot = analyzer.data['S1'][0]
    self.assertListEqual(prot['hits']['group'].tolist(), ['self', 'self'])
    self.assertListEqual(_prot_scores(prot), [1.95, 0.0, 0.0])
    self.assertEqual(prot['match'], '0')

    prot = analyzer.data['S1'][1]
    self.assertListEqual(prot['hits']['group'].tolist(), ['self', 'close'])
    self.assertListEqual(_prot_scores(prot), [0.9, 0.8, 0.0])
    self.assertEqual(prot['match'], '0')

    prot = analyzer.data['S2'][0]
    self.assertListEqual(prot['hits']['group'].tolist(),
                         ['self', 'close', 'distal'])
    self.assertListEqual(_prot_scores(prot), [0.9375, 0.875, 0.6875])
    self.assertEqual(prot['match'], '620')
def assign_taxonomy(self):
    """Assign taxonomy to genomes.

    User-specified TaxIDs in ``self.input_tax`` are taken first; every
    remaining genome in ``self.data`` gets a TaxID from
    ``self.infer_genome_tax``. The taxonomy database is then refined and
    the lowest common ancestor (LCA) of all genomes is stored in
    ``self.lca``. Progress is reported through ``print``.

    Raises
    ------
    ValueError
        If the input taxonomy parameter cannot be parsed while there is
        more than one sample, if a user-specified TaxID is not present in
        the taxonomy database, or if taxonomy cannot be auto-inferred for
        a genome.
    """
    # take user-defined taxIds of input genomes
    if self.input_tax:
        try:
            self.input_tax = dict_from_param(self.input_tax)
        except ValueError as err:
            if len(self.data) > 1:
                raise ValueError('Invalid input taxonomy format.') from err
            # for single-sample analysis, one can simply enter a taxId
            self.input_tax = {max(self.data): self.input_tax}
        print('User-specified TaxIDs of input genomes:')
        for sid, tid in sorted(self.input_tax.items()):
            if tid not in self.taxdump:
                # TODO: read from both temp and master taxdump
                raise ValueError('TaxID {} is not present in taxonomy '
                                 'database.'.format(tid))
            print(' {}: {} ({}).'.format(
                sid, tid, self.taxdump[tid]['name']))
    else:
        self.input_tax = {}

    # auto-infer taxIds of remaining genomes
    sids = sorted(x for x in self.data if x not in self.input_tax)
    if sids:
        print('Auto-inferring plausible taxIds for input genomes based on '
              'taxonomy of search results...')
        for sid in sids:
            # only the inference itself is guarded, so an unrelated error
            # is never reported as an inference failure
            try:
                tid, cov = self.infer_genome_tax(
                    self.data[sid], self.taxdump, self.input_cov)
            except ValueError as err:
                raise ValueError('Cannot auto-infer taxonomy for {}. '
                                 'Please specify manually.'
                                 .format(sid)) from err
            self.input_tax[sid] = tid
            # NOTE(review): '{:2g}' sets a minimum field width of 2, not a
            # precision -- confirm whether '{:.2g}' was intended.
            print(' {}: {} ({}) (covering {:2g}% best hits).'.format(
                sid, tid, self.taxdump[tid]['name'], cov))

    # refine taxonomy database
    print('Refining taxonomy database...')
    refine_taxdump(self.sum_taxids(), self.taxdump)
    add_children(self.taxdump)
    print('Done. Retained {} taxa.'.format(len(self.taxdump)))

    # find lowest common ancestor (LCA) of all genomes
    self.lca = find_lca(self.input_tax.values(), self.taxdump)
    print('All input genomes belong to {} ({}).'.format(
        self.lca, describe_taxon(self.lca, self.taxdump)))
def test_write_hgt_list(self):
    """Check the per-sample files produced by write_hgt_list."""
    me = Analyze()
    me.output = self.tmpdir
    hgt_dir = join(me.output, 'hgts')
    makedirs(hgt_dir, exist_ok=True)

    def _written(sample):
        # write the list for one sample and return the file content
        me.write_hgt_list(sample)
        with open(join(hgt_dir, '{}.txt'.format(sample)), 'r') as f:
            return f.read()

    # try/finally guarantees the output directory is removed even when an
    # assertion fails part-way through
    try:
        me.donor_name = False
        me.donor_rank = None
        me.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(me.taxdump)
        me.df = pd.DataFrame(
            [['S1', 'P1', 0.85, '562', True],
             ['S1', 'P2', 0.95, '622', True],
             ['S1', 'P3', 1.05, '0', True],
             ['S2', 'P4', 0.80, '766', True],
             ['S2', 'P5', 0.20, '0', False]],
            columns=['sample', 'protein', 'silh', 'match', 'hgt'])

        # default
        exp = ('P1\t0.85\t562\n'
               'P2\t0.95\t622\n'
               'P3\t1.05\t0\n')
        self.assertEqual(_written('S1'), exp)

        # number format and negative result
        self.assertEqual(_written('S2'), 'P4\t0.8\t766\n')

        # raise to family
        me.donor_rank = 'family'
        exp = ('P1\t0.85\t543\n'
               'P2\t0.95\t543\n'
               'P3\t1.05\t0\n')
        self.assertEqual(_written('S1'), exp)

        # report taxon name
        me.donor_rank = None
        me.donor_name = True
        exp = ('P1\t0.85\tEscherichia coli\n'
               'P2\t0.95\tShigella dysenteriae\n'
               'P3\t1.05\tN/A\n')
        self.assertEqual(_written('S1'), exp)
    finally:
        rmtree(hgt_dir)
def test_get_descendants(self):
    """Check the list returned by get_descendants for one taxon."""
    tree = taxdump_from_text(taxdump_archaea)
    add_children(tree)

    # Asgard group
    descendants = get_descendants('1935183', tree)
    self.assertListEqual(descendants, ['1655434', '1655637', '1538547'])
def test_get_descendants(self):
    """Check the list returned by get_descendants for one taxon."""
    from copy import deepcopy

    # Deep-copy the shared fixture: a shallow copy would still share the
    # nested per-taxon records, which add_children presumably annotates in
    # place (it returns nothing that is used here), so changes could leak
    # into other tests that use the same fixture.
    taxdump = deepcopy(taxdump_archaea)
    add_children(taxdump)

    obs = get_descendants('1935183', taxdump)  # Asgard group
    exp = ['1655434', '1655637', '1538547']
    self.assertListEqual(obs, exp)