def test_consistency_unrooted(self):
    """Consistency of a taxon whose score differs between rooted and unrooted.

    Tips a, b and c carry 's__Bacteroides pectinophilus' in the tree
    ((a,b),(c,(d,e))). The test expects that species to score 2/3 when the
    tree is treated as rooted and 1.0 when it is treated as unrooted, while
    every other name scores 1.0 in both modes.
    """
    nl.determine_rank_order('f__Lachnospiraceae; g__Bacteroides; s__')

    family = 'f__Lachnospiraceae'
    genus = 'g__Bacteroides'
    pectinophilus = 's__Bacteroides pectinophilus'
    acidifaciens = 's__Bacteroides acidifaciens'

    # every tip gets its own list object, as in a hand-written mapping
    tipname_map = {
        'a': [family, genus, pectinophilus],
        'b': [family, genus, pectinophilus],
        'c': [family, genus, pectinophilus],
        'd': [family, genus, acidifaciens],
        'e': [family, genus, acidifaciens],
    }

    newick = StringIO(u'((a,b),(c,(d,e)));')
    tree = nl.load_tree(newick, tipname_map)
    counts = nl.collect_names_at_ranks_counts(tree)
    nl.decorate_ntips_rank(tree)
    nl.decorate_name_counts(tree)

    scorer = Consistency(counts, len(nl.RANK_ORDER))

    # (rooted flag, ((rank index, taxon name, expected consistency), ...))
    scenarios = (
        (True, ((0, family, 1.0),
                (1, genus, 1.0),
                (2, pectinophilus, 0.66666666),
                (2, acidifaciens, 1.0))),
        (False, ((0, family, 1.0),
                 (1, genus, 1.0),
                 (2, pectinophilus, 1.0),
                 (2, acidifaciens, 1.0))),
    )
    for rooted, expected in scenarios:
        observed = scorer.calculate(tree, rooted=rooted)
        for rank, taxon, value in expected:
            self.assertAlmostEqual(observed[rank][taxon], value)
def test_consistency_missing(self):
    """Consistency scores when some tips lack names at one or more ranks.

    Several tips have ``None`` at the species rank and two tips (e, g) have
    no taxonomy at all. The test expects every named taxon to score 1.0 in
    both the rooted and the unrooted calculation.
    """
    nl.determine_rank_order('f__Lachnospiraceae; g__Bacteroides; s__')

    family = 'f__Lachnospiraceae'
    bacteroides = 'g__Bacteroides'
    lachnospira = 'g__Lachnospira'
    pectinophilus = 's__Bacteroides pectinophilus'

    tipname_map = {
        'a': [family, bacteroides, None],
        'c': [family, bacteroides, pectinophilus],
        'b': [family, bacteroides, None],
        'e': [None, None, None],
        'd': [family, bacteroides, pectinophilus],
        'g': [None, None, None],
        'f': [family, lachnospira, None],
        'h': [family, lachnospira, pectinophilus],
    }

    newick = StringIO(u'(((a,b),(c,d)),((e,f),(g,h)));')
    tree = nl.load_tree(newick, tipname_map)
    counts = nl.collect_names_at_ranks_counts(tree)
    nl.decorate_ntips_rank(tree)
    nl.decorate_name_counts(tree)

    scorer = Consistency(counts, len(nl.RANK_ORDER))

    # (rank index, taxon name) pairs, all expected to be fully consistent
    fully_consistent = (
        (0, family),
        (1, bacteroides),
        (1, lachnospira),
        (2, pectinophilus),
    )
    # rooted calculation first, then unrooted
    for rooted in (True, False):
        observed = scorer.calculate(tree, rooted=rooted)
        for rank, taxon in fully_consistent:
            self.assertAlmostEqual(observed[rank][taxon], 1.0)
def test_consistency_unrooted(self):
    """Consistency of a taxon whose score differs between rooted and unrooted.

    Tips a, b and c carry 's__Bacteroides pectinophilus' in the tree
    ((a,b),(c,(d,e))). The test expects that species to score 2/3 when the
    tree is treated as rooted and 1.0 when it is treated as unrooted, while
    every other name scores 1.0 in both modes.
    """
    nl.determine_rank_order('f__Lachnospiraceae; g__Bacteroides; s__')

    family = 'f__Lachnospiraceae'
    genus = 'g__Bacteroides'
    pectinophilus = 's__Bacteroides pectinophilus'
    acidifaciens = 's__Bacteroides acidifaciens'

    # every tip gets its own list object, as in a hand-written mapping
    tipname_map = {
        'a': [family, genus, pectinophilus],
        'b': [family, genus, pectinophilus],
        'c': [family, genus, pectinophilus],
        'd': [family, genus, acidifaciens],
        'e': [family, genus, acidifaciens],
    }

    tree = nl.load_tree('((a,b),(c,(d,e)));', tipname_map)
    counts = nl.collect_names_at_ranks_counts(tree)
    nl.decorate_ntips_rank(tree)
    nl.decorate_name_counts(tree)

    scorer = Consistency(counts, len(nl.RANK_ORDER))

    # (rooted flag, ((rank index, taxon name, expected consistency), ...))
    scenarios = (
        (True, ((0, family, 1.0),
                (1, genus, 1.0),
                (2, pectinophilus, 0.66666666),
                (2, acidifaciens, 1.0))),
        (False, ((0, family, 1.0),
                 (1, genus, 1.0),
                 (2, pectinophilus, 1.0),
                 (2, acidifaciens, 1.0))),
    )
    for rooted, expected in scenarios:
        observed = scorer.calculate(tree, rooted=rooted)
        for rank, taxon, value in expected:
            self.assertAlmostEqual(observed[rank][taxon], value)
def test_generate_constrings_valid_input(self):
    """generate_constrings on standard, valid input.

    Builds the consensus map and tree from the module-level fixtures
    (``test_cons``, ``test_tree``) and checks that the generated consensus
    strings equal ``test_results``, i.e. that the output mirrors nlevel
    (tax2tree's interface).
    """
    # the rank order is seeded from the lineage column of the first record
    seed_lineage = test_cons[0].split('\t')[1]
    determine_rank_order(seed_lineage)

    consensus = load_consensus_map(test_cons, False)
    annotated_tree = load_tree(test_tree, consensus)

    self.assertEqual(generate_constrings(annotated_tree, consensus),
                     test_results)
def test_consistency_missing(self):
    """Consistency scores when some tips lack names at one or more ranks.

    Several tips have ``None`` at the species rank and two tips (e, g) have
    no taxonomy at all. The test expects every named taxon to score 1.0 in
    both the rooted and the unrooted calculation.
    """
    nl.determine_rank_order('f__Lachnospiraceae; g__Bacteroides; s__')

    family = 'f__Lachnospiraceae'
    bacteroides = 'g__Bacteroides'
    lachnospira = 'g__Lachnospira'
    pectinophilus = 's__Bacteroides pectinophilus'

    tipname_map = {
        'a': [family, bacteroides, None],
        'c': [family, bacteroides, pectinophilus],
        'b': [family, bacteroides, None],
        'e': [None, None, None],
        'd': [family, bacteroides, pectinophilus],
        'g': [None, None, None],
        'f': [family, lachnospira, None],
        'h': [family, lachnospira, pectinophilus],
    }

    tree = nl.load_tree('(((a,b),(c,d)),((e,f),(g,h)));', tipname_map)
    counts = nl.collect_names_at_ranks_counts(tree)
    nl.decorate_ntips_rank(tree)
    nl.decorate_name_counts(tree)

    scorer = Consistency(counts, len(nl.RANK_ORDER))

    # (rank index, taxon name) pairs, all expected to be fully consistent
    fully_consistent = (
        (0, family),
        (1, bacteroides),
        (1, lachnospira),
        (2, pectinophilus),
    )
    # rooted calculation first, then unrooted
    for rooted in (True, False):
        observed = scorer.calculate(tree, rooted=rooted)
        for rank, taxon in fully_consistent:
            self.assertAlmostEqual(observed[rank][taxon], 1.0)
def __call__(self, seq_path=None, result_path=None, log_path=None):
    """Returns a dict mapping {seq_id:(taxonomy, confidence)} for each seq

    Keep in mind, "confidence" is only done for consistency and in fact
    all assignments will have a score of 0 because a method for determining
    confidence is not currently implemented.

    Parameters:
    seq_path: path to file of sequences. The sequences themselves are
        never actually used, but they are needed for their ids.
    result_path: path to file of results. If specified, the results are
        also written there as tab-separated
        ``seq_id<TAB>lineage<TAB>confidence`` lines; the results dict is
        returned either way.
    log_path: path to log, which should include dump of params.

    The taxonomy mapping is read from ``self.Params['id_to_taxonomy_fp']``
    and the tree from ``self.Params['tree_fp']``.
    """
    # initialize the logger
    logger = self._get_logger(log_path)
    logger.info(str(self))

    # only the sequence ids are needed downstream
    with open(seq_path, 'U') as f:
        seqs = dict(MinimalFastaParser(f))

    # Open the taxonomy file in a context manager so the handle is closed
    # as soon as the consensus records have been prepared.
    with open(self.Params['id_to_taxonomy_fp']) as taxonomy_f:
        consensus_map = tax2tree.prep_consensus(taxonomy_f, seqs.keys())

    # the rank order is seeded from the lineage column of the first record
    seed_con = consensus_map[0].strip().split('\t')[1]
    determine_rank_order(seed_con)

    tipnames_map = load_consensus_map(consensus_map, False)

    # Same for the tree file: close it once the tree has been loaded.
    with open(self.Params['tree_fp']) as tree_f:
        tree = load_tree(tree_f, tipnames_map)

    results = tax2tree.generate_constrings(tree, tipnames_map)
    results = tax2tree.clean_output(results, seqs.keys())

    if result_path:
        # if the user provided a result_path, write the
        # results to file
        with open(result_path, 'w') as f:
            for seq_id, (lineage, confidence) in results.iteritems():
                f.write('%s\t%s\t%s\n' % (seq_id, lineage, confidence))
        logger.info('Result path: %s' % result_path)

    return results
def __call__(self, seq_path=None, result_path=None, log_path=None):
    """Returns a dict mapping {seq_id:(taxonomy, confidence)} for each seq

    Keep in mind, "confidence" is only done for consistency and in fact
    all assignments will have a score of 0 because a method for determining
    confidence is not currently implemented.

    Parameters:
    seq_path: path to file of sequences. The sequences themselves are
        never actually used, but they are needed for their ids.
    result_path: path to file of results. If specified, the results are
        also written there as tab-separated
        ``seq_id<TAB>lineage<TAB>confidence`` lines; the results dict is
        returned either way.
    log_path: path to log, which should include dump of params.

    The taxonomy mapping is read from ``self.Params['id_to_taxonomy_fp']``
    and the tree from ``self.Params['tree_fp']``.
    """
    # initialize the logger
    logger = self._get_logger(log_path)
    logger.info(str(self))

    # only the sequence ids are needed downstream
    with open(seq_path, 'U') as f:
        seqs = dict(parse_fasta(f))

    # Open the taxonomy file in a context manager so the handle is closed
    # as soon as the consensus records have been prepared.
    with open(self.Params['id_to_taxonomy_fp']) as taxonomy_f:
        consensus_map = tax2tree.prep_consensus(taxonomy_f, seqs.keys())

    # the rank order is seeded from the lineage column of the first record
    seed_con = consensus_map[0].strip().split('\t')[1]
    determine_rank_order(seed_con)

    tipnames_map = load_consensus_map(consensus_map, False)

    # Same for the tree file: close it once the tree has been loaded.
    with open(self.Params['tree_fp']) as tree_f:
        tree = load_tree(tree_f, tipnames_map)

    results = tax2tree.generate_constrings(tree, tipnames_map)
    results = tax2tree.clean_output(results, seqs.keys())

    if result_path:
        # if the user provided a result_path, write the
        # results to file
        with open(result_path, 'w') as f:
            for seq_id, (lineage, confidence) in results.iteritems():
                f.write('%s\t%s\t%s\n' % (seq_id, lineage, confidence))
        logger.info('Result path: %s' % result_path)

    return results
def flat_errors(tax_lines):
    """Collect flat-file taxonomy errors, grouped by error category.

    The rank order is derived from the lineage column (second tab-separated
    field) of the first line. Each line is then parsed and checked for
    incorrect prefixes, an incorrect number of levels, and gaps.

    Returns a ``defaultdict(list)`` mapping an error description to the ids
    of the offending records. Within a category, only the first id seen for
    a given parsed taxonomy (or, for gaps, the taxonomy up to and including
    the taxon after the gap) is reported.
    """
    inc_prefix = 'Incorrect prefixes'
    inc_nlevel = 'Incorrect number of levels'
    inc_gap = 'Gaps in taxonomy'

    first_lineage = tax_lines[0].strip().split('\t')[1]
    rank_order = determine_rank_order(first_lineage)
    nlevels = len(rank_order)

    errors = defaultdict(list)
    errors_seen = defaultdict(set)

    def _report_once(category, signature, record_id):
        # record the id only the first time this signature shows up
        already_seen = errors_seen[category]
        if signature not in already_seen:
            already_seen.add(signature)
            errors[category].append(record_id)

    for line in tax_lines:
        id_, parsed = check_parse(line)

        if not check_prefixes(parsed, rank_order):
            _report_once(inc_prefix, parsed, id_)

        if not check_n_levels(parsed, nlevels):
            _report_once(inc_nlevel, parsed, id_)

        if not check_gap(parsed):
            # keep everything through the taxon following the gap:
            # +1 for that taxon, and another +1 as the slice is exclusive
            through_gap = parsed[:find_gap(parsed) + 2]
            _report_once(inc_gap, through_gap, id_)

    return errors