def _lineage(TreeNode): lineage = [node.name for node in TreeNode.ancestors()] lineage = lineage[::-1] #lowest to highest node order if not TreeNode.is_tip(): lineage.append(TreeNode.name) lineage = ";".join(lineage[1:]) #first node -- the root -- has no name return lineage
def test_defaults(self):
    # Categorical response fitted with a binomial family: every piece
    # returned by phylofactor must match the stored expectations.
    def load_expected(relative_path):
        return pd.read_csv(self.get_data_path(relative_path), sep='\t')

    expected_basis = load_expected('expected/categorical_basis.tsv')
    expected_data = load_expected('expected/categorical_data.tsv')
    expected_groups = load_expected('expected/categorical_groups.tsv')
    expected_factors = load_expected('expected/categorical_factors.tsv')
    expected_tree = TreeNode.read(
        self.get_data_path('expected/categorical_tree.nwk'))

    data, basis, out_tree, groups, factors = phylofactor(
        self.table, self.phylogeny, self.metadata,
        formula='Categorical~Data', nfactors=3, family='binomial')

    comparisons = (
        (basis, expected_basis),
        (groups, expected_groups),
        (factors, expected_factors),
        (data, expected_data),
    )
    for observed, expected in comparisons:
        assert_frame_equal(observed, expected)
    # identical topologies have a Robinson-Foulds distance of zero
    self.assertEqual(TreeNode.compare_rfd(expected_tree, out_tree), 0)
def test_continous(self):
    # Continuous response fitted with a poisson family: every piece
    # returned by phylofactor must match the stored expectations.
    def load_expected(relative_path):
        return pd.read_csv(self.get_data_path(relative_path), sep='\t')

    expected_basis = load_expected('expected/numeric_basis.tsv')
    expected_data = load_expected('expected/numeric_data.tsv')
    expected_groups = load_expected('expected/numeric_groups.tsv')
    expected_factors = load_expected('expected/numeric_factors.tsv')
    expected_tree = TreeNode.read(
        self.get_data_path('expected/numeric_tree.nwk'))

    data, basis, out_tree, groups, factors = phylofactor(
        self.table, self.phylogeny, self.metadata,
        formula='Continuous~Data', nfactors=3, family='poisson')

    comparisons = (
        (basis, expected_basis),
        (groups, expected_groups),
        (factors, expected_factors),
        (data, expected_data),
    )
    for observed, expected in comparisons:
        assert_frame_equal(observed, expected)
    # identical topologies have a Robinson-Foulds distance of zero
    self.assertEqual(TreeNode.compare_rfd(expected_tree, out_tree), 0)
def main_calc_tree_distance(lang_set_mat, dist_metric="rfd"):
    """Build a predicted tree and measure its distance to the reference tree.

    A linkage matrix is derived from ``lang_set_mat`` and turned into a
    tree whose tips are ``INDO_EURO_LANG_NAMES``.  The tree is serialised,
    every decimal number in the text is replaced by ``1``, and the result
    is parsed again.

    With ``dist_metric == "rfd"`` the distance is ``compare_rfd`` against
    ``GT_INDO_EUROPEAN_TREE``; for any other value it is the sum of squared
    differences between the two trees' tip-to-tip distance matrices.

    Returns a ``(tree_dist, pred_tree)`` tuple.
    """
    linkage_matrix = get_linkage_matrix(lang_set_mat)
    weighted_tree = TreeNode.from_linkage_matrix(linkage_matrix,
                                                 INDO_EURO_LANG_NAMES)
    buffer = StringIO()
    weighted_tree.write(buffer)

    # Replace distances with 1
    unit_length_text = re.sub(r"\d+\.\d+", "1", buffer.getvalue())
    pred_tree = TreeNode.read(StringIO(unit_length_text))

    if dist_metric == "rfd":
        return pred_tree.compare_rfd(GT_INDO_EUROPEAN_TREE), pred_tree

    reference = GT_INDO_EUROPEAN_TREE.tip_tip_distances()
    reference_matrix = reference.data
    reference_ids = reference.ids
    # request the predicted distances in the reference's tip order
    predicted_matrix = pred_tree.tip_tip_distances(
        endpoints=list(reference_ids)).data
    squared_error = np.sum((reference_matrix - predicted_matrix)**2)
    return squared_error, pred_tree
def unifrac(classifications, weighted=True,
            field='readcount_w_children', rank='species', strict=False):
    """
    Compute the UniFrac beta diversity for a set of classifications.

    ``weighted=True`` requests 'weighted_unifrac', otherwise
    'unweighted_unifrac' is used.  With ``strict=True`` a
    OneCodexException is raised as soon as a classification's job id
    differs from that of the first classification.
    """
    assert field in ACCEPTABLE_FIELDS
    counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)

    # grow one shared tree across all classifications
    merged_tree = None
    for classification in classifications:
        if strict and classification.job.id != classifications[0].job.id:
            raise OneCodexException('All Classifications must have the same Job for Unifrac')
        merged_tree = generate_skbio_tree(classification,
                                          existing_tree=merged_tree)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so we do a little faking here
    new_tree = TreeNode(name='fake root')
    new_tree.rank = 'no rank'
    new_tree.append(merged_tree)

    # prune low-level nodes off the tree so the tips are what we're comparing
    prune_to_rank(new_tree, rank=rank)

    metric = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
    return skbio.diversity.beta_diversity(metric, counts, ids,
                                          tree=new_tree, otu_ids=tax_ids)
def _newick_to_tree_node(fh, convert_underscores=True):
    """Build a tree of TreeNode objects from a stream of newick tokens.

    Tokens come from ``_tokenize_newick(fh, convert_underscores=...)``.
    Nodes under construction are kept on a stack of ``(node, depth)``
    pairs; the root is pushed first at depth 0.

    Returns the root node when a ``;`` token is seen while only the root
    remains on the stack.

    Raises NewickFormatError when a length is not numeric, when a ``)``
    arrives with fewer than two stack entries, when a parent already has
    children, or when the tokens end (or a ``;`` is seen) without the
    stack being reduced to the root.
    """
    tree_stack = []
    current_depth = 0
    last_token = ''
    next_is_distance = False
    root = TreeNode()
    tree_stack.append((root, current_depth))
    for token in _tokenize_newick(fh, convert_underscores=convert_underscores):
        # Check for a label
        # NOTE: this is a substring test, so the initial empty last_token
        # is treated like a structural token and skipped here.
        if last_token not in '(,):':
            if not next_is_distance:
                # the previous token was a label for the node on top
                tree_stack[-1][0].name = last_token if last_token else None
            else:
                # the previous token was the length announced by ':'
                next_is_distance = False
        # Check for a distance
        if token == ':':
            next_is_distance = True
        elif last_token == ':':
            try:
                tree_stack[-1][0].length = float(token)
            except ValueError:
                raise NewickFormatError("Could not read length as numeric type"
                                        ": %s." % token)
        elif token == '(':
            # descend one level and start its first child
            current_depth += 1
            tree_stack.append((TreeNode(), current_depth))
        elif token == ',':
            # start a sibling at the current depth
            tree_stack.append((TreeNode(), current_depth))
        elif token == ')':
            if len(tree_stack) < 2:
                raise NewickFormatError("Could not parse file as newick."
                                        " Parenthesis are unbalanced.")
            children = []
            # Pop all nodes at this depth as they belong to the remaining
            # node on the top of the stack as children.
            while current_depth == tree_stack[-1][1]:
                node, _ = tree_stack.pop()
                # insert at the front to keep the children in input order
                children.insert(0, node)
            parent = tree_stack[-1][0]
            if parent.children:
                raise NewickFormatError("Could not parse file as newick."
                                        " Contains unnested children.")
            # This is much faster than TreeNode.extend
            for child in children:
                child.parent = parent
            parent.children = children
            current_depth -= 1
        elif token == ';':
            if len(tree_stack) == 1:
                return root
            # nodes are still open: fall through to the error below
            break
        last_token = token
    raise NewickFormatError("Could not parse file as newick."
                            " `(Parenthesis)`, `'single-quotes'`,"
                            " `[comments]` may be unbalanced, or tree may be"
                            " missing its root.")
def main():
    """Read the two tree files named on the command line and print their distance.

    The file paths come from ``parse_args()`` (``tree1_file`` and
    ``tree2_file``).  Each file is opened in a ``with`` block so the
    handle is closed as soon as the tree has been parsed; previously both
    handles were left open.
    """
    args = parse_args()
    with open(args.tree1_file) as tree1_handle:
        tree1 = TreeNode.read(tree1_handle)
    with open(args.tree2_file) as tree2_handle:
        tree2 = TreeNode.read(tree2_handle)
    tree_dist = calc_tree_distance(tree1, tree2)
    print("Tree distance: %d" % tree_dist)
def unifrac(self, weighted=True, rank="auto"):
    """Compute the UniFrac beta diversity between the samples.

    Parameters
    ----------
    weighted : `bool`
        Use the 'weighted_unifrac' metric when True, 'unweighted_unifrac'
        when False.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Passed to `to_df` to select the taxonomic level of the abundances.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, a distance matrix.

    Raises
    ------
    OneCodexException
        If the data looks normalized; raw read counts are required.
    """
    # needs read counts, not relative abundances
    import skbio.diversity

    if self._guess_normalized():
        raise OneCodexException("UniFrac requires unnormalized read counts.")

    df = self.to_df(rank=rank, normalize=False)
    # one list of counts per sample, in index order
    counts = [df.loc[sample_id].tolist() for sample_id in df.index]
    tax_ids = df.keys().tolist()

    pruned = self.tree_prune_rank(self.tree_build(), rank=df.ocx_rank)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so we do a little faking here
    from skbio.tree import TreeNode

    new_tree = TreeNode(name="fake root")
    new_tree.rank = "no rank"
    new_tree.append(pruned)

    # then finally run the calculation and return
    metric = "weighted_unifrac" if weighted else "unweighted_unifrac"
    return skbio.diversity.beta_diversity(
        metric, counts, df.index.tolist(), tree=new_tree, otu_ids=tax_ids
    )
def testSistersOneIncompleteSister(self):
    # Only 'g3' is expected among the sisters found for tip B; the
    # neighbouring (C, D) clade carries no name.
    ann = TreeAnnotator()
    newick = "((A:1, B:2):3, ((C:1,D:1):1, (E:1,F:5)'g3':6):10)root;"
    tree = TreeNode.read(StringIO(newick))
    print(tree.ascii_art())
    found = ann.find_sisters(tree, tree.find('B'))
    expected_names = sorted(['g3'])
    observed_names = sorted([sister.name for sister in found])
    assert_equals(expected_names, observed_names)
def testNoPairs(self):
    # No examples are expected for ranks 'f' and 'g' on this tree.
    newick = "(((A:1, B:2):3, (C:4, D:5):6)'f__family; g__genoos':10)root;"
    tree = TreeNode.read(StringIO(newick))
    finder = ThresholdFinder()
    found = finder.find_examples(tree, 'f', 'g')
    self.assertSameCladeDistanceSet([], found)
def test__generate_html_summary_phylogeny(self):
    # The placed-fragments rows must appear in the summary page only when
    # a phylogeny is handed to _generate_html_summary.
    biom_fp = join('qtp_biom', 'support_files', 'sepp.biom')
    tree_fp = join('qtp_biom', 'support_files', 'sepp.tre')

    # load metadata
    metadata_url = '/qiita_db/analysis/%s/metadata/' % 1
    metadata = self.qclient.get(metadata_url)

    # load phylogeny
    phylogeny = TreeNode.read(tree_fp)

    index_fp, viz_fp, qza_fp = _generate_html_summary(
        biom_fp, metadata, self.out_dir, True, tree=phylogeny)

    # test if two expected tags show up in the html summary page
    with open(index_fp) as handle:
        html = handle.read()
    self.assertTrue('<th>Number placed fragments</th>' in html)
    self.assertTrue('<td>434</td>' in html)

    # test that phylogeny specific html content does not show up if no
    # tree is given
    index_fp, viz_fp, qza_fp = _generate_html_summary(
        biom_fp, metadata, self.out_dir, True, tree=None)
    with open(index_fp) as handle:
        html = handle.read()
    self.assertTrue('<th>Number placed fragments</th>' not in html)
def build_base_silva_taxonomy(tree_file, tax_dict):
    """Map every non-root node of the tree to its ranked taxonomy.

    ``tax_dict`` maps a node name to a ``(rank, taxonomy)`` pair.  For each
    node visited in postorder (stopping at the root) the node itself and
    then its non-root ancestors are looked up; entries whose rank is in
    ``allowed_ranks`` are kept.

    Returns ``{node_name.strip(): {allowed_ranks_dict[rank]: taxonomy}}``
    where each taxonomy string has been passed through
    ``filter_characters``.
    """
    print("Building base SILVA taxonomy...")
    tree = TreeNode.read(tree_file)
    taxonomy_by_node = {}
    for node in tree.postorder():
        if node.is_root():
            break

        # the node itself first, then its ancestors up to (not including)
        # the root
        lineage = [node]
        for ancestor in node.ancestors():
            if ancestor.is_root():
                break
            lineage.append(ancestor)

        ranked = []
        for member in lineage:
            member_rank, member_taxonomy = tax_dict[member.name]
            cleaned = filter_characters(member_taxonomy)
            if member_rank in allowed_ranks:
                ranked.append((allowed_ranks_dict[member_rank], cleaned))

        taxonomy_by_node[node.name.strip()] = dict(ranked)
    return taxonomy_by_node
def testSimpleTwice(self):
    # The same threshold given twice must yield the same clustering twice.
    newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    thresholds = [0.25, 0.25]
    observed = Tree2Tax().named_clusters_for_several_thresholds(
        tree, thresholds)
    expected = [
        [0.25, [['A', 'B'], ['D']]],
        [0.25, [['A', 'B'], ['D']]],
    ]
    self.assertSameClusterSets(expected, observed)
def _build_trees(clade_counts, edge_lengths, support_attr):
    """Construct the trees with support

    Clades are processed smallest first.  Each processed clade becomes a
    TreeNode and is nested, as a single frozenset element, inside every
    remaining clade that contains it.

    Parameters
    ----------
    clade_counts : dict
        Keyed by the frozenset of the clade and valued by the support
    edge_lengths : dict
        Keyed by the frozenset of the clade and valued by the weighted length
    support_attr : str
        The name of the attribute to hold the support value

    Returns
    -------
    list of TreeNode
        A list of the constructed trees

    Notes
    -----
    Both `clade_counts` and `edge_lengths` are modified in place: their
    keys are replaced as clades are nested into their ancestors.
    """
    # maps a clade (frozenset) to the TreeNode built for it; a node is
    # removed again once it has been attached to a parent
    nodes = {}
    queue = [(len(clade), clade) for clade in clade_counts]
    while queue:
        # The values within the queue are updated on each iteration, so it
        # doesn't look like an insertion sort will make sense unfortunately
        queue.sort()
        (clade_size, clade) = queue.pop(0)
        new_queue = []

        # search for ancestors of clade
        for (_, ancestor) in queue:
            if clade.issubset(ancestor):
                # update ancestor such that, in the following example:
                # ancestor == {1, 2, 3, 4}
                # clade == {2, 3}
                # new_ancestor == {1, {2, 3}, 4}
                new_ancestor = (ancestor - clade) | frozenset([clade])

                # update references for counts and lengths
                clade_counts[new_ancestor] = clade_counts.pop(ancestor)
                edge_lengths[new_ancestor] = edge_lengths.pop(ancestor)

                ancestor = new_ancestor

            # every remaining clade is re-queued with its current size
            new_queue.append((len(ancestor), ancestor))

        # if the clade is a tip, then we have a name
        if clade_size == 1:
            name = list(clade)[0]
        else:
            name = None

        # the clade will not be in nodes if it is a tip
        children = [nodes.pop(c) for c in clade if c in nodes]
        length = edge_lengths[clade]

        node = TreeNode(children=children, length=length, name=name)
        setattr(node, support_attr, clade_counts[clade])
        nodes[clade] = node

        queue = new_queue
    # whatever was never attached to a parent is the root of a tree
    return list(nodes.values())
def testClusterNamingWithBootstraps(self):
    # The internal label '0.7:G' must still produce cluster names based
    # on 'G'.
    newick = "((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)'0.7:G':30)root;"
    tree = TreeNode.read(StringIO(newick))
    clusters = Tree2Tax().named_clusters(tree, 40)
    self.assertSameClusters([['F'], _('A B D H')], clusters)
    observed_names = [cluster.name() for cluster in clusters]
    assert_equals(_('G.2 G.1'), observed_names)
def test_ilr_ordination(self):
    # Ordination on a column-shuffled table: tips 'f' and 'e' are in the
    # input tree but not in the table, and are absent from the expected
    # output tree.
    np.random.seed(0)
    counts = pd.DataFrame([[1, 1, 2, 2],
                           [1, 2, 2, 1],
                           [2, 2, 1, 1]],
                          index=[1, 2, 3],
                          columns=['a', 'b', 'c', 'd'])
    shuffled_columns = np.random.permutation(counts.columns)
    counts = counts.reindex(columns=shuffled_columns)
    tree = TreeNode.read([
        '((c:0.025,d:0.025,f:0.1,e:0.025):0.2,(b:0.025,a:0.025):0.2);'])

    res_ord, res_tree, res_md = ilr_phylogenetic_ordination(
        counts, tree, top_k_var=3)

    # expected balances per sample
    expected_balances = pd.DataFrame(
        [[0.693147, 0.0, 3.892122e-17],
         [0.0, -4.901291e-01, -4.901291e-01],
         [-0.693147, -5.551115e-17, -3.892122e-17]],
        columns=['y0', 'y1', 'y2'],
        index=[1, 2, 3])
    expected_balances = expected_balances[['y0', 'y1', 'y2']]
    expected_balances.index.name = 'sampleid'
    pdt.assert_frame_equal(res_ord.samples, expected_balances)

    # expected tree, serialised
    expected_newick = ('((b:0.025,a:0.025)y1:0.2,'
                       '(c:0.025,d:0.025)y2:0.2)y0;\n')
    self.assertEqual(str(res_tree), expected_newick)

    # expected per-feature metadata
    expected_md = pd.DataFrame(
        [[-0.5, -0.707107, 0.000000],
         [-0.5, 0.707107, 0.000000],
         [0.5, 0.000000, -0.707107],
         [0.5, 0.000000, 0.707107]],
        columns=['y0', 'y1', 'y2'],
        index=['b', 'a', 'c', 'd'])
    expected_md.index.name = 'featureid'
    pdt.assert_frame_equal(res_md, expected_md)
def make_modules(dist, min_dist, obs_ids):
    """Group observations into modules using a complete-linkage tree.

    The tree is walked in level order.  An internal node becomes a module
    when not all of its tips have been seen yet and no pair of its tips
    has a distance greater than ``min_dist``.  Tips reached individually
    are only marked as seen.

    Returns the modules (frozensets of tip names) sorted by size, largest
    first, as soon as every tip has been seen.

    Raises ValueError if the traversal ends before all tips were seen.
    """
    # create linkage matrix using complete linkage
    linkage_matrix = complete(dist)
    # make tree from linkage matrix with names from dist
    tree = TreeNode.from_linkage_matrix(linkage_matrix, obs_ids)
    # get all tips so in the end we can check if we are done
    tip_total = len([n for n in tree.postorder() if n.is_tip()])

    # square lookup table so pairs can be addressed by name
    dist = pd.DataFrame(squareform(dist), index=obs_ids, columns=obs_ids)

    modules = set()
    seen = set()
    for node in tree.levelorder():
        if node.is_tip():
            seen.add(node.name)
        else:
            members = frozenset(
                tip.name for tip in node.postorder() if tip.is_tip())
            if members.issubset(seen):
                continue
            exceeds = any(dist.loc[first, second] > min_dist
                          for first, second in combinations(members, 2))
            if exceeds:
                continue
            modules.add(members)
            seen.update(members)
        if len(seen) == tip_total:
            return sorted(modules, key=len, reverse=True)
    raise ValueError("Well, how did I get here?")
def depth_partition(self, input_tree, percentile, output_tree):
    '''
    Partition a tree into clusters by comparing each node's distance
    distribution with a whole-tree percentile cutoff.

    The tree is read from input_tree and rooted at its midpoint. The
    cutoff is the given percentile of self._node_dist(tree). Walking the
    tree in preorder, an internal node whose median self._node_dist value
    is <= the cutoff becomes a cluster: it is renamed through
    self._rename and it and everything below it are skipped afterwards.
    The tree is then written to output_tree in newick format.

    Parameters
    ----------
    input_tree: passed to skbio's TreeNode.read
    percentile: float
        The percentile of the whole-tree distribution used as the cutoff.
    output_tree: passed to TreeNode.write

    Returns
    -------
    dict mapping "partition_<n>" to the tip names of that cluster (spaces
    replaced by underscores), plus self.UNCLUSTERED mapping to the tips
    that ended up in no cluster.

    Clustering method modified from Prosperi et al method:
    Prosperi, M.C.F., et al. A novel methodology for large-scale
    phylogeny partition. Nat. Commun. 2:321 doi: 10.1038/ncomms1325 (2011).
    http://www.nature.com/ncomms/journal/v2/n5/full/ncomms1325.html
    '''
    tree = TreeNode.read(input_tree).root_at_midpoint()
    found = 0
    clustered = set()
    clusters = {}

    logging.debug("Calculating %ith percentile cutoff from root" % (percentile))
    cutoff = np.percentile(self._node_dist(tree), percentile)
    logging.debug("Cutoff (%ith percentile): %f" % (percentile, cutoff))

    for node in tree.preorder():
        # skip anything already absorbed into a cluster, and bare tips
        if node in clustered or node.is_tip():
            continue
        median = np.median(self._node_dist(node))
        logging.debug("Median of node: %f" % median)
        if median <= cutoff:
            logging.debug("Cluster found!")
            found += 1
            cluster_name = "partition_%i" % (found)
            clusters[cluster_name] = [tip.name.replace(' ', '_')
                                      for tip in node.tips()]
            self._rename(node, cluster_name)
            clustered.update(node.traverse())

    logging.info("%i depth cluster(s) found in tree" % (found))
    tree.write(output_tree, "newick")

    logging.debug("Recording tips that were not partitioned")
    clusters[self.UNCLUSTERED] = [tip.name.replace(' ', '_')
                                  for tip in tree.tips()
                                  if tip not in clustered]
    return clusters
def depth_partition(self, input_tree, percentile, output_tree):
    '''
    Cluster a tree around nodes whose median distance falls at or below a
    percentile cutoff taken from the whole tree.

    Steps, as implemented here:
      1. read input_tree and re-root it at the midpoint;
      2. take the given percentile of self._node_dist(tree) as cutoff;
      3. in preorder, turn every not-yet-clustered internal node with
         median self._node_dist(node) <= cutoff into "partition_<n>",
         rename it with self._rename and mark its whole subtree;
      4. write the tree to output_tree as newick.

    Parameters
    ----------
    input_tree: passed to skbio's TreeNode.read
    percentile: float
        Percentile used to derive the cutoff.
    output_tree: passed to TreeNode.write

    Returns
    -------
    dict of cluster name -> list of tip names (spaces replaced with
    underscores); the key self.UNCLUSTERED holds the leftover tips.

    Clustering method modified from Prosperi et al method:
    Prosperi, M.C.F., et al. A novel methodology for large-scale
    phylogeny partition. Nat. Commun. 2:321 doi: 10.1038/ncomms1325 (2011).
    http://www.nature.com/ncomms/journal/v2/n5/full/ncomms1325.html
    '''
    midpoint_rooted = TreeNode.read(input_tree)
    midpoint_rooted = midpoint_rooted.root_at_midpoint()

    next_index = 1
    members_seen = set()
    partitions = {}

    logging.debug("Calculating %ith percentile cutoff from root" % (percentile))
    tree_distances = self._node_dist(midpoint_rooted)
    threshold = np.percentile(tree_distances, percentile)
    logging.debug("Cutoff (%ith percentile): %f" % (percentile, threshold))

    for candidate in midpoint_rooted.preorder():
        if candidate in members_seen:
            continue
        if candidate.is_tip():
            continue

        candidate_median = np.median(self._node_dist(candidate))
        logging.debug("Median of node: %f" % candidate_median)
        if candidate_median <= threshold:
            logging.debug("Cluster found!")
            label = "partition_%i" % (next_index)
            tip_labels = []
            for leaf in candidate.tips():
                tip_labels.append(leaf.name.replace(' ', '_'))
            partitions[label] = tip_labels
            self._rename(candidate, label)
            next_index += 1
            # everything below (and including) this node is now taken
            for member in candidate.traverse():
                members_seen.add(member)

    logging.info("%i depth cluster(s) found in tree" % (next_index - 1))
    midpoint_rooted.write(output_tree, "newick")

    logging.debug("Recording tips that were not partitioned")
    leftovers = []
    partitions[self.UNCLUSTERED] = leftovers
    for leaf in midpoint_rooted.tips():
        if leaf not in members_seen:
            leftovers.append(leaf.name.replace(' ', '_'))
    return partitions
def unifrac(self, weighted=True, rank="auto"):
    """Return the UniFrac distance matrix for the samples in this object.

    Parameters
    ----------
    weighted : `bool`
        True selects 'weighted_unifrac', False 'unweighted_unifrac'.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Taxonomic level handed to `to_df` when collecting the counts.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, a distance matrix.

    Raises
    ------
    OneCodexException
        When `_guess_normalized` reports normalized data.
    """
    # needs read counts, not relative abundances
    if self._guess_normalized():
        raise OneCodexException("UniFrac requires unnormalized read counts.")

    df = self.to_df(rank=rank, normalize=False)

    sample_counts = [df.loc[row_id].tolist() for row_id in df.index]
    tax_ids = df.keys().tolist()

    full_tree = self.tree_build()
    ranked_tree = self.tree_prune_rank(full_tree, rank=df.ocx_rank)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so we do a little faking here
    from skbio.tree import TreeNode

    new_tree = TreeNode(name="fake root")
    new_tree.rank = "no rank"
    new_tree.append(ranked_tree)

    # then finally run the calculation and return
    if weighted:
        metric_name = "weighted_unifrac"
    else:
        metric_name = "unweighted_unifrac"
    return skbio.diversity.beta_diversity(
        metric_name, sample_counts, df.index.tolist(), tree=new_tree, otu_ids=tax_ids
    )
def testFullTaxonomy(self):
    # The full taxonomy of tip D joins the family and genus labels
    # found above it.
    ann = TreeAnnotator()
    newick = "(((A:1, B:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
    tree = TreeNode.read(StringIO(newick))
    observed = ann.full_taxonomy(tree, tree.find('D'))
    assert_equals('f__family; g__genus2', observed)
def testSistersSelfNoParent(self):
    # Tip B sits inside the named clade 'g__genus1'; no sisters are
    # expected for it.
    ann = TreeAnnotator()
    newick = "(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
    tree = TreeNode.read(StringIO(newick))
    found = ann.find_sisters(tree, tree.find('B'))
    observed_names = [sister.name for sister in found]
    assert_equals([], observed_names)
def testFindParents(self):
    # find_named_parent must return the closest named node above a tip,
    # or None when there is no named node to find.
    ann = TreeAnnotator()

    # cases where a named parent exists: (newick, tip, expected, message)
    named_cases = (
        ("(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;",
         'B', 'g__genus1', 'self is named'),
        ("(((A:1, 2475:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;",
         '2475', 'g__genus1', 'parent directly above'),
        ("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;",
         '2475', 'f__family', 'parent 2 above'),
    )
    for newick, tip_name, expected_name, message in named_cases:
        tree = TreeNode.read(StringIO(newick))
        parent = ann.find_named_parent(tree, tree.find(tip_name))
        assert_equals(expected_name, parent.name, message)

    # cases where nothing named lies above the starting node
    tree = TreeNode.read(StringIO("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10);"))
    start = tree.find('f__family').parent
    assert_equals(None, ann.find_named_parent(tree, start), 'parent of root')

    tree = TreeNode.read(StringIO("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6):10);"))
    start = tree.find('g__genus2').parent
    assert_equals(None, ann.find_named_parent(tree, start), 'no parent before root')
def testClusterNamingConventionsWithSomeUnnamed(self):
    # With a tiny threshold everything is a separate cluster, and each
    # cluster takes a numbered name based on 'G'.
    newick = '((((A:11, B:12):10, D:9):20, F:20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    clusters = Tree2Tax().named_clusters(tree, 0.05)
    self.assertSameClusters([['A'], ['B'], ['D'], ['F']], clusters)
    observed_names = [cluster.name() for cluster in clusters]
    assert_equals(['G.1', 'G.2', 'G.3', 'G.4'], observed_names)
def testMultiplyNamedNode(self):
    # A node labelled with two ranks ('g__genus2; s__spec') is reported
    # under its full label.
    newick = "(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2; s__spec':6)'f__family':10)root;"
    tree = TreeNode.read(StringIO(newick))
    found = ThresholdFinder().find_examples(tree, 'f', 'g')
    expected = [['f__family', 'g__genus1', 'g__genus2; s__spec', 16.0]]
    self.assertSameCladeDistanceSet(expected, found)
def generate_html_summary(qclient, job_id, parameters, out_dir):
    """Build the HTML summary of a BIOM artifact and register it in Qiita

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values; 'input_data' holds the artifact id
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, None, str
        Whether the job is successful
        Ignored
        The error message, if not successful
    """
    # Step 1: gather file information from qiita using REST api
    qclient_url = "/qiita_db/artifacts/%s/" % parameters['input_data']
    artifact_info = qclient.get(qclient_url)

    # Step 2: get the mapping file, depends if analysis or not
    analysis_id = artifact_info['analysis']
    is_analysis = analysis_id is not None
    if is_analysis:
        md = qclient.get('/qiita_db/analysis/%s/metadata/' % analysis_id)
    else:
        prep_url = ('/qiita_db/prep_template/%s/' %
                    artifact_info['prep_information'][0])
        md = qclient.get(prep_url)['qiime-map']

    # a phylogeny is only available when a plain_text file is attached
    files = artifact_info['files']
    tree = TreeNode.read(files['plain_text'][0]) if 'plain_text' in files \
        else None

    # Step 3: generate HTML summary
    # if we get to this point of the code we are sure that this is a biom file
    # and that it only has one element
    index_fp, viz_fp, qza_fp = _generate_html_summary(
        artifact_info['files']['biom'][0], md, out_dir, is_analysis, tree)

    # Step 4: add the new file to the artifact using REST api
    try:
        qclient.patch(qclient_url, 'add', '/html_summary/',
                      value=dumps({'html': index_fp, 'dir': viz_fp}))
    except Exception as e:
        return False, None, str(e)
    return True, None, ""
def testOppositeSorting(self):
    # Thresholds supplied in ascending order are reported in that order.
    newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    cluster_sets = Tree2Tax().named_clusters_for_several_thresholds(
        tree, [0.05, 0.25])
    expected = [
        [0.05, [['A'], ['B'], ['D']]],
        [0.25, [['A', 'B'], ['D']]],
    ]
    self.assertSameClusterSets(expected, cluster_sets)
    first_names = [cluster.name() for cluster in cluster_sets[0].clusters]
    assert_equals(_('C.1 C.2 Root'), first_names)
def testTreeSubtree2(self):
    '''one genus is a subtree of another, and the longest branch is in both subtrees'''
    newick = "((((A:1, B:52)'g__genus1':3, D:50)'g__genus2':6)'f__family':10)root;"
    tree = TreeNode.read(StringIO(newick))
    finder = ThresholdFinder()
    found = finder.find_examples(tree, 'f', 'g')
    expected = [['f__family', 'g__genus1', 'g__genus2', 105.0]]
    self.assertSameCladeDistanceSet(expected, found)
def testSistersSisterWithDescendentNames(self):
    # Sisters of tip B are 'g2' and 'g3'; the nested name 's1' below
    # 'g2' is not among the expected results.
    ann = TreeAnnotator()
    newick = "((A:1, B:2):3, (((a:1,b:1)'s1':1,D:1)'g2':1, (E:1,F:5)'g3':6):10)root;"
    tree = TreeNode.read(StringIO(newick))
    print(tree.ascii_art())
    found = ann.find_sisters(tree, tree.find('B'))
    expected_names = sorted(['g2', 'g3'])
    observed_names = sorted([sister.name for sister in found])
    assert_equals(expected_names, observed_names)
def write_tree():
    """Cluster the matrix in ``distance_matrix`` and write ``bsr_matrix.tree``.

    The tab-separated file ``distance_matrix`` (first column is the index)
    is read from the current directory, its values are squared, clustered
    with scipy's ``weighted`` linkage, and the resulting tree is written as
    text with all single quotes removed.

    Fixes over the previous version:
    * ``DataFrame.as_matrix()`` no longer exists in current pandas;
      ``.values`` is used instead.
    * the square matrix is converted to condensed form first; a 2-D array
      handed to the linkage function is otherwise interpreted as a set of
      observation vectors rather than as pairwise distances.
    * the output file is closed even when writing fails.
    """
    # local import keeps this block self-contained
    from scipy.spatial.distance import squareform

    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    squared = np.square(dmx.values)
    # linkage expects the condensed (upper-triangle) form of the distances
    condensed = squareform(squared)
    hclust = weighted(condensed)
    t = TreeNode.from_linkage_matrix(hclust, ids)
    # strip the quotes that the serialiser puts around some tip names
    nw = str(t).replace("'", "")
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
def testNaming(self):
    # Cluster sets come back ordered by threshold (25 before 40), each
    # with its own numbering of the 'G' based names.
    newick = '((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    cluster_sets = Tree2Tax().named_clusters_for_several_thresholds(
        tree, [40, 25])
    expected = [
        [25, [['F'], _('A B'), _('D H')]],
        [40, [['F'], _('A B D H')]],
    ]
    self.assertSameClusterSets(expected, cluster_sets)
    names_at_25 = [cluster.name() for cluster in cluster_sets[0].clusters]
    assert_equals(_('G.3 G.1 G.2'), names_at_25)
    names_at_40 = [cluster.name() for cluster in cluster_sets[1].clusters]
    assert_equals(_('G.2 G.1'), names_at_40)
def testFindParents(self):
    # find_named_parent must return the closest named node above the
    # given node, or None when no named node lies above it.
    ann = TreeAnnotator()

    newick = "(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
    tree = TreeNode.read(StringIO(newick))
    found = ann.find_named_parent(tree, tree.find('B'))
    assert_equals('g__genus1', found.name, 'self is named')

    newick = "(((A:1, 2475:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
    tree = TreeNode.read(StringIO(newick))
    found = ann.find_named_parent(tree, tree.find('2475'))
    assert_equals('g__genus1', found.name, 'parent directly above')

    newick = "(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
    tree = TreeNode.read(StringIO(newick))
    found = ann.find_named_parent(tree, tree.find('2475'))
    assert_equals('f__family', found.name, 'parent 2 above')

    newick = "(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10);"
    tree = TreeNode.read(StringIO(newick))
    above_family = tree.find('f__family').parent
    assert_equals(None, ann.find_named_parent(tree, above_family),
                  'parent of root')

    newick = "(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6):10);"
    tree = TreeNode.read(StringIO(newick))
    above_genus = tree.find('g__genus2').parent
    assert_equals(None, ann.find_named_parent(tree, above_genus),
                  'no parent before root')
def test_missing_taxonomy(self):
    # missing_taxonomy lists the named nodes strictly between tip A and
    # the given upper node, closest to the upper node first.
    newick = '((((A:11, B:12)C:10, D:9)E:20, F:20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))

    up_to_e = TaxonomyFunctions().missing_taxonomy(
        tree, tree.find('A'), tree.find('E'))
    assert_equals(['C'], up_to_e)

    up_to_self = TaxonomyFunctions().missing_taxonomy(
        tree, tree.find('A'), tree.find('A'))
    assert_equals([], up_to_self)

    up_to_g = TaxonomyFunctions().missing_taxonomy(
        tree, tree.find('A'), tree.find('G'))
    assert_equals(['E', 'C'], up_to_g)
def get_clusters(x_original, axis='row'):
    """Return row positions in the tip order of a UPGMA tree.

    A copy of ``x_original`` is used (transposed when ``axis == 'column'``).
    Euclidean distances between its rows are clustered with average
    linkage, and the integer row positions are returned in the order of
    the resulting tree's tips.
    """
    data = x_original.copy()
    data = data.T if axis == 'column' else data
    row_count = data.shape[0]
    # rows are identified by their position, as strings
    row_dissims = pw_distances(data, ids=map(str, range(row_count)),
                               metric='euclidean')
    # do upgma - rows
    # Average in SciPy's cluster.hierarchy.linkage is UPGMA
    condensed = row_dissims.condensed_form()
    upgma_tree = TreeNode.from_linkage_matrix(
        linkage(condensed, method='average'), row_dissims.ids)
    ordered_rows = []
    for tip in upgma_tree.tips():
        ordered_rows.append(int(tip.name))
    return ordered_rows
def tree_build(self):
    """Assemble a tree from the rows of ``self.taxonomy``.

    One node is created per taxonomy index entry (branch length 1) and
    annotated with ``tax_name``, ``rank`` and ``parent_tax_id``.  Each
    node is then attached to the node of its parent id.  Entries whose
    parent is not present are left unattached, with a warning unless the
    entry is tax id "1".

    Returns
    -------
    `skbio.tree.TreeNode`, the node for tax id "1".
    """
    from skbio.tree import TreeNode

    taxonomy = self.taxonomy

    # build all the nodes
    nodes = {}
    for tax_id in taxonomy.index:
        new_node = TreeNode(name=tax_id, length=1)
        new_node.tax_name = taxonomy["name"][tax_id]
        new_node.rank = taxonomy["rank"][tax_id]
        new_node.parent_tax_id = taxonomy["parent_tax_id"][tax_id]
        nodes[tax_id] = new_node

    # generate all the links
    for tax_id in taxonomy.index:
        child = nodes[tax_id]
        parent_id = child.parent_tax_id
        if parent_id in nodes:
            nodes[parent_id].append(child)
        elif tax_id != "1":
            # tax id "1" is expected to have no parent in the table
            warnings.warn(
                "tax_id={} has parent_tax_id={} which is not in tree"
                "".format(tax_id, parent_id))

    return nodes["1"]
def tree_build(self):
    """Turn ``self.taxonomy`` into a linked tree of skbio nodes.

    Every index entry of the taxonomy becomes a node of length 1 carrying
    ``tax_name``, ``rank`` and ``parent_tax_id`` attributes; nodes are then
    appended to their parents.  A missing parent triggers a warning for
    every tax id other than "1" and the node stays detached.

    Returns
    -------
    `skbio.tree.TreeNode`, the node registered under tax id "1".
    """
    from skbio.tree import TreeNode

    def make_node(tax_id):
        # create and annotate the node for one taxonomy row
        node = TreeNode(name=tax_id, length=1)
        node.tax_name = self.taxonomy["name"][tax_id]
        node.rank = self.taxonomy["rank"][tax_id]
        node.parent_tax_id = self.taxonomy["parent_tax_id"][tax_id]
        return node

    # build all the nodes
    by_tax_id = {}
    for tax_id in self.taxonomy.index:
        by_tax_id[tax_id] = make_node(tax_id)

    # generate all the links
    for tax_id in self.taxonomy.index:
        current = by_tax_id[tax_id]
        try:
            parent_node = by_tax_id[current.parent_tax_id]
        except KeyError:
            if tax_id != "1":
                warnings.warn(
                    "tax_id={} has parent_tax_id={} which is not in tree"
                    "".format(tax_id, current.parent_tax_id)
                )
            continue
        parent_node.append(current)

    return by_tax_id["1"]
def get_clusters(x_original, axis=['row', 'column'][0]):
    """Order rows by the tips of a UPGMA tree over euclidean distances.

    Works on a copy of ``x_original``, transposed when
    ``axis == 'column'``.  Returns the integer row positions in the order
    in which they appear as tips of the average-linkage tree.
    """
    matrix = x_original.copy()
    if axis == 'column':
        matrix = matrix.T
    n_rows = matrix.shape[0]

    euclidean = get_nonphylogenetic_metric('euclidean')
    # rows are labelled with their position, as strings
    dissimilarities = DistanceMatrix(euclidean(matrix),
                                     map(str, range(n_rows)))

    # do upgma - rows
    # Average in SciPy's cluster.hierarchy.linkage is UPGMA
    upgma_linkage = linkage(dissimilarities.condensed_form(),
                            method='average')
    upgma_tree = TreeNode.from_linkage_matrix(upgma_linkage,
                                              dissimilarities.ids)
    return [int(tip.name) for tip in upgma_tree.tips()]
def testTipToCluster(self):
    # After clustering at two thresholds, each tip must map back to the
    # right named cluster in each of the two cluster sets.
    newick = '((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    clusters = Tree2Tax().named_clusters_for_several_thresholds(tree, [40, 25])
    expected_sets = [
        [25, [['F'], _('A B'), _('D H')]],
        [40, [['F'], _('A B D H')]],
    ]
    self.assertSameClusterSets(expected_sets, clusters)
    assert_equals(_('G.3 G.1 G.2'), [c.name() for c in clusters[0].clusters])
    assert_equals(_('G.2 G.1'), [c.name() for c in clusters[1].clusters])

    # tip name -> expected cluster name in clusters[0] and clusters[1]
    expectations = (
        ('F', ('G.3', 'G.2')),
        ('D', ('G.2', 'G.1')),
    )
    for tip_name, expected_names in expectations:
        tip = tree.find(tip_name)
        for position, expected_name in enumerate(expected_names):
            owner = clusters[position].tip_to_cluster(tip)
            assert_equals(expected_name, owner.name())
def load_tree_files(tree_dir):
    """Load every tree file found in a directory.

    Files whose name starts with '.' are ignored.  If the file names can
    all be parsed by ``parse_rarefaction_fname`` and yield more than one
    distinct base name, a warning is issued because the trees may come
    from different methods.  Each loaded tree gets a ``filepath``
    attribute holding its file name.

    returns [trees]

    raises a RuntimeError if no trees are loaded; an IOError while reading
    a tree is reported on stderr and terminates the program with status 1.

    Changes from the previous version: the files are opened without the
    'U' mode flag (rejected by current Python; universal newlines are the
    text-mode default), each handle is closed even when parsing fails, and
    ``sys.exit`` replaces the interactive-only ``exit`` builtin.
    """
    tree_file_names = os.listdir(tree_dir)
    # ignore invisible files like .DS_Store
    tree_file_names = [fname for fname in tree_file_names
                       if not fname.startswith('.')]

    # try to warn user if using multiple types of trees {
    try:
        base_names = [parse_rarefaction_fname(fname)[0]
                      for fname in tree_file_names]
    except ValueError:
        # names do not follow the rarefaction convention; nothing to check
        pass
    else:
        if len(set(base_names)) > 1:
            warnstr = """
warning: trees are named differently, please be sure you're not
comparing trees generated in different manners, unless you're quite sure
that's what you intend to do. types: """ + str(set(base_names)) + """
continuing anyway..."""
            warn(warnstr)
    # }

    trees = []
    for fname in tree_file_names:
        try:
            with open(os.path.join(tree_dir, fname)) as f:
                tree = TreeNode.from_newick(f)
            tree.filepath = fname
            trees.append(tree)
        except IOError:
            sys.stderr.write('error loading tree ' + fname + '\n')
            sys.exit(1)

    if len(trees) == 0:
        raise RuntimeError('Error: no trees loaded' +
                           ', check that tree directory has has valid trees')
    return trees
def write_tree(cluster_method):
    """Cluster the matrix in ``distance_matrix`` and write ``bsr_matrix.tree``.

    The tab-separated file ``distance_matrix`` (first column is the index)
    is read from the current directory, its values are squared and
    converted to condensed form, and then clustered.

    Parameters
    ----------
    cluster_method : str
        "average" or "weighted", selecting the scipy linkage function.
        Any other value prints a message and terminates the program.

    Changes from the previous version: an invalid method now exits with a
    non-zero status instead of reporting success, and the output file is
    closed even when writing fails.
    """
    import scipy.spatial.distance as ssd

    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    triu = np.square(dmx.values)
    # the linkage functions expect the condensed form of the distances
    distArray = ssd.squareform(triu)
    if cluster_method == "average":
        hclust = average(distArray)
    elif cluster_method == "weighted":
        hclust = weighted(distArray)
    else:
        print("invalid cluster method chosen")
        # signal the failure to the calling shell
        sys.exit(1)
    t = TreeNode.from_linkage_matrix(hclust, ids)
    # strip the quotes that the serialiser puts around some tip names
    nw = str(t).replace("'", "")
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
def single_file_upgma(input_file, output_file):
    """Build a UPGMA tree from a distance matrix file and write it as newick.

    The distance matrix is read from ``input_file``, clustered with
    scipy's average linkage, converted to a tree and written (with
    distances) to ``output_file``.

    Raises RuntimeError when no tree object was produced; any other
    AttributeError from serialising the tree is re-raised unchanged.

    Changes from the previous version: the error handler tested an
    undefined name (``c``), which turned every AttributeError into a
    NameError; it now tests ``tree``.  The output file is also closed when
    an exception is raised.
    """
    # read in dist matrix
    dist_mat = DistanceMatrix.read(input_file)

    # SciPy uses average as UPGMA:
    # http://docs.scipy.org/doc/scipy/reference/generated/
    #    scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    linkage_matrix = linkage(dist_mat.condensed_form(), method='average')

    tree = TreeNode.from_linkage_matrix(linkage_matrix, dist_mat.ids)

    # write output
    with open(output_file, 'w') as f:
        try:
            f.write(tree.to_newick(with_distances=True))
        except AttributeError:
            if tree is None:
                raise RuntimeError("""input file %s did not make a UPGMA tree.
 Ensure it has more than one sample present""" % (str(input_file),))
            raise
def _open_tree(self, tree_path):
    '''
    Open a tree file, determine what decorations are already present. Strip
    unwanted decoration.

    Internal nodes are inspected in turn: a name that parses as a float is
    treated as a bootstrap/confidence value; the first name that does not
    parse causes the tree to be passed through ``self._strip_tree`` and the
    scan to stop; an unnamed internal node triggers a single warning.

    Parameters
    ----------
    tree_path: str
        Path to a file containing a phylogenetic tree, in Newick format.

    Returns
    -------
    skbio TreeNode object
    '''
    # Close the handle as soon as the tree has been parsed.
    with open(tree_path) as tree_io:
        tree_obj = TreeNode.read(tree_io)

    bootstrapped = True
    for node in tree_obj.non_tips():
        if node.name:
            try:
                float(node.name)
            except (TypeError, ValueError):
                # Only conversion failures mean "not a bootstrap value";
                # anything else (e.g. KeyboardInterrupt) must propagate.
                logging.debug("Tree is decorated already. Stripping all "
                              "previous decoration from the tree.")
                bootstrapped = False
                tree_obj = self._strip_tree(tree_obj)
                break
        else:
            if bootstrapped:
                # Warn once only: the flag is cleared right after.
                logging.warning("This tree doesn't appear correctly "
                                "formatted or there is information missing. "
                                "No boostrap value or decoration "
                                "found for bare node. ")
                bootstrapped = False

    if bootstrapped:
        logging.debug("Tree is bootstrap or has confidence values "
                      "assigned to the nodes.")
    return tree_obj
def test_missing_taxonomy(self):
    """missing_taxonomy between tip A and each of the nodes E, A and G."""
    newick = '((((A:11, B:12)C:10, D:9)E:20, F:20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))

    # (expected names, name of the second node handed to missing_taxonomy)
    cases = (
        (['C'], 'E'),
        ([], 'A'),
        (['E', 'C'], 'G'),
    )
    for expected, other_name in cases:
        finder = TaxonomyFunctions()
        observed = finder.missing_taxonomy(tree,
                                           tree.find('A'),
                                           tree.find(other_name))
        assert_equals(expected, observed)
def result_constructor(x):
    """Parse the Newick text ``x`` and return the resulting TreeNode."""
    newick_stream = StringIO(x)
    return TreeNode.read(newick_stream, format='newick')
def testSimple(self):
    """A threshold of 0.25 yields the clusters [A, B] and [D]."""
    newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    observed = Tree2Tax().named_clusters(tree, 0.25)
    expected = [['A', 'B'], ['D']]
    self.assertSameClusters(expected, observed)
def testNoClusering(self):
    """A threshold of 0.05 leaves every tip in its own named cluster."""
    newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    observed = Tree2Tax().named_clusters(tree, 0.05)

    self.assertSameClusters([['A'], ['B'], ['D']], observed)

    expected_names = _('C.1 C.2 Root')
    observed_names = [cluster.name() for cluster in observed]
    assert_equals(expected_names, observed_names)
def testClusterEverything(self):
    """A threshold of 0.5 merges all tips into one cluster named 'Root'."""
    newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    observed = Tree2Tax().named_clusters(tree, 0.5)

    self.assertSameClusters([['A', 'B', 'D']], observed)

    first_cluster = observed[0]
    assert_equals('Root', first_cluster.name())
def testClusterOnTwoInternalNodes(self):
    """A threshold of 40 groups A, B, D, H together and leaves F alone."""
    newick = '((((A:11, B:12)C:10, (H:8, D:9)I:3)E:20, F:20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    observed = Tree2Tax().named_clusters(tree, 40)
    expected = [_('A B D H'), ['F']]
    self.assertSameClusters(expected, observed)
def testFullTaxonomy(self):
    """full_taxonomy of tip D joins its named ancestors with '; '."""
    ann = TreeAnnotator()
    newick = "(((A:1, B:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
    tree = TreeNode.read(StringIO(newick))
    tip_d = tree.find('D')
    observed = ann.full_taxonomy(tree, tip_d)
    assert_equals('f__family; g__genus2', observed)
def testNamingWithBootstraps(self):
    """A numeric internal label (0.091) is not used to name clusters."""
    newick = '((A:0.11, B:0.12)0.091:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    observed = Tree2Tax().named_clusters(tree, 0.05)

    self.assertSameClusters([['A'], ['B'], ['D']], observed)

    expected_names = _('Root.1 Root.2 Root.3')
    observed_names = [cluster.name() for cluster in observed]
    assert_equals(expected_names, observed_names)
def testClusterNamingWithBootstraps(self):
    """A '0.7:G' label names clusters after G (expected 'G.2', 'G.1')."""
    newick = ("((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)"
              "'0.7:G':30)root;")
    tree = TreeNode.read(StringIO(newick))
    observed = Tree2Tax().named_clusters(tree, 40)

    self.assertSameClusters([['F'], _('A B D H')], observed)

    expected_names = _('G.2 G.1')
    observed_names = [cluster.name() for cluster in observed]
    assert_equals(expected_names, observed_names)
def testClusterNamingConventionsWithSomeUnnamed(self):
    """With unnamed inner nodes, the clusters are numbered G.1 to G.4."""
    newick = '((((A:11, B:12):10, D:9):20, F:20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    # i.e. everything is a separate cluster
    observed = Tree2Tax().named_clusters(tree, 0.05)

    self.assertSameClusters([['A'], ['B'], ['D'], ['F']], observed)

    observed_names = [cluster.name() for cluster in observed]
    assert_equals(['G.1', 'G.2', 'G.3', 'G.4'], observed_names)
def test_run_pick_de_novo_otus_parallel(self):
    """run_pick_de_novo_otus generates expected results in parallel
    """
    self.params['assign_taxonomy'] = \
        {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
         'reference_seqs_fp': self.test_data['refseqs'][0]}
    self.params['align_seqs'] = \
        {'template_fp': self.test_data['refseqs_aligned'][0]}
    self.params['filter_alignment'] = \
        {'lane_mask_fp': self.test_data['refseqs_aligned_lanemask'][0]}
    actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
        self.test_data['seqs'][0],
        self.test_out,
        call_commands_serially,
        self.params,
        self.qiime_config,
        parallel=True,
        status_update_callback=no_status_updates)

    input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
    otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                      '%s_otus.txt' % input_file_basename)
    alignment_fp = join(self.test_out,
                        'pynast_aligned_seqs',
                        '%s_rep_set_aligned.fasta' % input_file_basename)
    failures_fp = join(self.test_out,
                       'pynast_aligned_seqs',
                       '%s_rep_set_failures.fasta' % input_file_basename)
    taxonomy_assignments_fp = join(
        self.test_out,
        'uclust_assigned_taxonomy',
        '%s_rep_set_tax_assignments.txt' % input_file_basename)
    otu_table_fp = join(self.test_out, 'otu_table.biom')
    tree_fp = join(self.test_out, 'rep_set.tre')

    self.assertEqual(actual_tree_fp, tree_fp)
    self.assertEqual(actual_otu_table_fp, otu_table_fp)

    # Number of OTUs falls within a range that was manually
    # confirmed
    # (read inside a context manager so the handle is closed promptly)
    with open(otu_map_fp) as otu_map_f:
        otu_map_lines = list(otu_map_f)
    num_otus = len(otu_map_lines)
    otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
    self.assertEqual(num_otus, 14)

    # all otus get taxonomy assignments
    with open(taxonomy_assignments_fp) as tax_f:
        taxonomy_assignment_lines = list(tax_f)
    self.assertEqual(len(taxonomy_assignment_lines), num_otus)

    # number of seqs which aligned + num of seqs which failed to
    # align sum to the number of OTUs
    self.assertEqual(
        count_seqs(alignment_fp)[0] + count_seqs(failures_fp)[0],
        num_otus)

    # number of tips in the tree equals the number of sequences that
    # aligned
    with open(tree_fp) as f:
        tree = TreeNode.from_newick(f)
    self.assertEqual(len(list(tree.tips())), count_seqs(alignment_fp)[0])

    # parse the otu table
    otu_table = load_table(otu_table_fp)
    expected_sample_ids = [
        'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1']
    # sample IDs are as expected
    self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
    # otu ids are as expected
    self.assertItemsEqual(otu_table.ids(axis='observation'),
                          otu_map_otu_ids)
    # number of sequences in the full otu table equals the number of
    # input sequences
    number_seqs_in_otu_table = sum([v.sum() for v in otu_table.iter_data()])
    self.assertEqual(
        number_seqs_in_otu_table,
        count_seqs(self.test_data['seqs'][0])[0])

    # Check that the log file is created and has size > 0
    log_fp = glob(join(self.test_out, 'log*.txt'))[0]
    self.assertTrue(getsize(log_fp) > 0)
def test_run_pick_de_novo_otus_muscle(self):
    """run_pick_de_novo_otus w muscle generates expected results
    """
    self.params['assign_taxonomy'] = \
        {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
         'reference_seqs_fp': self.test_data['refseqs'][0]}
    self.params['align_seqs'] = {'alignment_method': 'muscle'}
    self.params['filter_alignment'] = \
        {'suppress_lane_mask_filter': None,
         'entropy_threshold': '0.10'}

    run_pick_de_novo_otus(
        self.test_data['seqs'][0],
        self.test_out,
        call_commands_serially,
        self.params,
        self.qiime_config,
        parallel=False,
        status_update_callback=no_status_updates)

    input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
    otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                      '%s_otus.txt' % input_file_basename)
    alignment_fp = join(self.test_out,
                        'muscle_aligned_seqs',
                        '%s_rep_set_aligned.fasta' % input_file_basename)
    taxonomy_assignments_fp = join(
        self.test_out,
        'uclust_assigned_taxonomy',
        '%s_rep_set_tax_assignments.txt' % input_file_basename)
    otu_table_fp = join(self.test_out, 'otu_table.biom')
    tree_fp = join(self.test_out, 'rep_set.tre')

    # Number of OTUs falls within a range that was manually
    # confirmed
    # (read inside a context manager so the handle is closed promptly)
    with open(otu_map_fp) as otu_map_f:
        otu_map_lines = list(otu_map_f)
    num_otus = len(otu_map_lines)
    otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
    self.assertEqual(num_otus, 14)

    # all otus get taxonomy assignments
    with open(taxonomy_assignments_fp) as tax_f:
        taxonomy_assignment_lines = list(tax_f)
    self.assertEqual(len(taxonomy_assignment_lines), num_otus)

    # all OTUs align
    self.assertEqual(count_seqs(alignment_fp)[0], num_otus)

    # all OTUs in tree
    with open(tree_fp) as f:
        tree = TreeNode.from_newick(f)
    self.assertEqual(len(list(tree.tips())), num_otus)

    # check that the two final output files have non-zero size
    self.assertTrue(getsize(tree_fp) > 0)
    self.assertTrue(getsize(otu_table_fp) > 0)

    # Check that the log file is created and has size > 0
    log_fp = glob(join(self.test_out, 'log*.txt'))[0]
    self.assertTrue(getsize(log_fp) > 0)

    # parse the otu table
    otu_table = load_table(otu_table_fp)
    expected_sample_ids = [
        'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1']
    # sample IDs are as expected
    self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
    # expected OTUs
    self.assertItemsEqual(otu_table.ids(axis='observation'),
                          otu_map_otu_ids)
    # number of sequences in the full otu table equals the number of
    # input sequences
    number_seqs_in_otu_table = sum([v.sum() for v in otu_table.iter_data()])
    self.assertEqual(
        number_seqs_in_otu_table,
        count_seqs(self.test_data['seqs'][0])[0])
def testSistersSelfNoParent(self):
    """find_sisters for tip B is expected to return no sisters here."""
    ann = TreeAnnotator()
    newick = ("(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)"
              "'f__family':10)root;")
    tree = TreeNode.read(StringIO(newick))
    sisters = ann.find_sisters(tree, tree.find('B'))
    sister_names = [sister.name for sister in sisters]
    assert_equals([], sister_names)
def testSistersOneIncompleteSister(self):
    """find_sisters for tip B is expected to return only 'g3'."""
    ann = TreeAnnotator()
    newick = "((A:1, B:2):3, ((C:1,D:1):1, (E:1,F:5)'g3':6):10)root;"
    tree = TreeNode.read(StringIO(newick))
    print(tree.ascii_art())
    sisters = ann.find_sisters(tree, tree.find('B'))
    expected_names = sorted(['g3'])
    observed_names = sorted([sister.name for sister in sisters])
    assert_equals(expected_names, observed_names)
def testClusterNamingOnTwoInternalNodesReverseOrder(self):
    """With F listed first under G, the names are 'G.2' then 'G.1'."""
    newick = '((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    observed = Tree2Tax().named_clusters(tree, 40)

    self.assertSameClusters([['F'], _('A B D H')], observed)

    expected_names = _('G.2 G.1')
    observed_names = [cluster.name() for cluster in observed]
    assert_equals(expected_names, observed_names)
def testClusterIntoThree(self):
    """A threshold of 25 yields the clusters [A, B], [D, H] and [F]."""
    newick = '((((A:11, B:12)C:10, (H:8, D:9)I:3)E:20, F:20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    observed = Tree2Tax().named_clusters(tree, 25)
    expected = [_('A B'), _('D H'), ['F']]
    self.assertSameClusters(expected, observed)
def testSistersSisterWithDescendentNames(self):
    """find_sisters for tip B is expected to return 'g2' and 'g3'."""
    ann = TreeAnnotator()
    newick = ("((A:1, B:2):3, (((a:1,b:1)'s1':1,D:1)'g2':1, "
              "(E:1,F:5)'g3':6):10)root;")
    tree = TreeNode.read(StringIO(newick))
    print(tree.ascii_art())
    sisters = ann.find_sisters(tree, tree.find('B'))
    expected_names = sorted(['g2', 'g3'])
    observed_names = sorted([sister.name for sister in sisters])
    assert_equals(expected_names, observed_names)