def find_match(self, df):
    """Pick the single taxId that best summarizes the top hits.

    Parameters
    ----------
    df : pd.DataFrame
        hit table; the `score` and `taxid` columns are read, and the first
        row supplies the reference (best) score

    Returns
    -------
    str
        taxId of match, or '0' if the hit table has no rows

    Notes
    -----
    The match is the lowest common ancestor (LCA) of the "top hits", i.e.,
    every hit whose bit score is at least `self.match_th` times the score of
    the first row. This mirrors the taxonomic classification behavior of
    DIAMOND.
    """
    try:
        best_hit = df.iloc[0]
    except IndexError:
        # no rows at all: there is nothing to match against
        return '0'

    # score cutoff relative to the best hit
    cutoff = best_hit['score'] * self.match_th
    top_hits = df[df['score'] >= cutoff]
    return find_lca(top_hits['taxid'], self.taxdump)
def infer_close_group(self):
    """Infer close group automatically.

    Notes
    -----
    Starting at the LCA of the self group, the candidate taxon is moved up
    one rank at a time until the candidate (plus its descendants, minus the
    self group) is non-empty and, when `close_size` is set, contains at
    least that many taxIds. The climb also stops when the current taxon is
    its own parent or its parent is '0'.

    1. Assign `close_tax` as top-level taxId(s) of the close group.
    2. Assign `groups['close']` as all taxIds under the close group.
    """
    taxdump = self.taxdump

    # the search begins at the LCA of the self group
    current = find_lca(self.self_tax, taxdump)
    while True:
        # candidate members: this taxon and everything below it, with the
        # self group taken out
        lineage = [current] + get_descendants(current, taxdump)
        members = set(lineage).difference(self.groups['self'])

        # accept once non-empty and (if a size limit is set) large enough
        if members:
            if not self.close_size or len(members) >= self.close_size:
                break

        # otherwise climb one level, unless already at the top
        parent = taxdump[current]['parent']
        if parent in (current, '0'):
            break
        current = parent

    self.close_tax = [current]
    self.groups['close'] = members
def assign_taxonomy(self):
    """Assign a taxId to every input genome and record their shared LCA.

    Notes
    -----
    1. User-specified taxIds (`input_tax`) are parsed and validated against
       the taxonomy database. If parsing fails and there is only one
       sample, the raw value is taken as that sample's taxId.
    2. Genomes without a user-specified taxId get one auto-inferred from
       their search results.
    3. The taxonomy database is refined to the taxIds in use, and `lca` is
       set to the LCA of all input genomes.

    Raises
    ------
    ValueError
        If the user-specified taxonomy cannot be parsed for a multi-sample
        analysis, if a specified taxId is absent from the taxonomy
        database, or if a genome's taxonomy cannot be auto-inferred.
    """
    if not self.input_tax:
        self.input_tax = {}
    else:
        # take user-defined taxIds of input genomes
        try:
            self.input_tax = dict_from_param(self.input_tax)
        except ValueError:
            if len(self.data) > 1:
                raise ValueError('Invalid input taxonomy format.')
            # for single-sample analysis, one can simply enter a taxId
            only_sample = max(self.data)
            self.input_tax = {only_sample: self.input_tax}

        print('User-specified TaxIDs of input genomes:')
        for sample, taxid in sorted(self.input_tax.items()):
            if taxid not in self.taxdump:
                # TODO: read from both temp and master taxdump
                raise ValueError('TaxID {} is not present in taxonomy '
                                 'database.'.format(taxid))
            name = self.taxdump[taxid]['name']
            print(' {}: {} ({}).'.format(sample, taxid, name))

    # auto-infer taxIds of remaining genomes
    pending = sorted(x for x in self.data if x not in self.input_tax)
    if pending:
        print('Auto-inferring plausible taxIds for input genomes based on '
              'taxonomy of search results...')
    for sample in pending:
        try:
            taxid, cov = self.infer_genome_tax(
                self.data[sample], self.taxdump, self.input_cov)
        except ValueError:
            raise ValueError('Cannot auto-infer taxonomy for {}. '
                             'Please specify manually.'.format(sample))
        else:
            self.input_tax[sample] = taxid
        name = self.taxdump[taxid]['name']
        print(' {}: {} ({}) (covering {:2g}% best hits).'.format(
            sample, taxid, name, cov))

    # refine taxonomy database
    print('Refining taxonomy database...')
    refine_taxdump(self.sum_taxids(), self.taxdump)
    add_children(self.taxdump)
    print('Done. Retained {} taxa.'.format(len(self.taxdump)))

    # find lowest common ancestor (LCA) of all genomes
    self.lca = find_lca(self.input_tax.values(), self.taxdump)
    lca_desc = describe_taxon(self.lca, self.taxdump)
    print('All input genomes belong to {} ({}).'.format(self.lca, lca_desc))
def build_taxonmap(self):
    """Build the protein-to-taxonomy map and write it to disk.

    Notes
    -----
    Each protein in `p2tids` is mapped to its only taxId when it has exactly
    one, otherwise to the lowest common ancestor (LCA) of its taxIds. The
    result is stored in `taxonmap` and written, sorted, as tab-delimited
    lines to the gzip file `taxon.map.gz` under the output directory.
    """
    # resolve one taxId per protein; shared proteins go to their LCA
    resolved = {}
    for protein, tids in self.p2tids.items():
        if len(tids) == 1:
            resolved[protein] = next(iter(tids))
        else:
            resolved[protein] = find_lca(tids, self.taxdump)
    self.taxonmap = resolved

    # write taxonomy map
    fname = 'taxon.map.gz'
    target = join(self.output, fname)
    with gzip.open(target, 'wb') as fh:
        for p, tid in sorted(self.taxonmap.items()):
            record = f'{p}\t{tid}\n'
            fh.write(record.encode())
    print(f'Protein-to-taxonomy map written to {fname}.')
def test_find_lca(self):
    """Check `find_lca` on the archaea test taxdump.

    Covers a single taxId, several taxIds, a query that already contains
    the expected ancestor, duplicated taxIds, and a self-parented taxId
    added to the taxdump, for which a ValueError is expected.
    """
    taxdump = taxdump_from_text(taxdump_archaea)

    # (query taxIds, expected LCA)
    cases = (
        (['131567'], '131567'),
        (['1935183', '1783276'], '2157'),
        (['1935183', '1783276', '1655434'], '2157'),
        (['1935183', '1783276', '2157'], '2157'),
        (['1935183', '2'], '131567'),
        (['1', '2', '1'], '1'),
    )
    for query, expected in cases:
        self.assertEqual(find_lca(query, taxdump), expected)

    # a taxon that is its own parent, added alongside the existing tree
    taxdump['x'] = {'name': 'x', 'parent': 'x'}
    with self.assertRaises(ValueError) as ctx:
        find_lca(['2', 'x'], taxdump)
    self.assertEqual(
        str(ctx.exception), 'Cannot find LCA of taxIds in database.')