def gradient_clustering(table: pd.DataFrame,
                        gradient: MetadataCategory,
                        weighted=True) -> skbio.TreeNode:
    """ Builds a tree for features based on a gradient.

    Parameters
    ----------
    table : pd.DataFrame
       Contingency table where rows are samples and columns are features.
    gradient : qiime2.MetadataCategory
       Continuous vector of measurements corresponding to samples.
    weighted : bool
       Specifies if abundance or presence/absence information
       should be used to perform the clustering.

    Returns
    -------
    skbio.TreeNode
       Represents the partitioning of features with respect to the gradient.
    """
    # ``np.float`` was merely an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, so the builtin is used
    # directly. The resulting dtype is the same.
    c = gradient.to_series().astype(float)

    if not weighted:
        # Collapse abundances to presence/absence.
        table = table > 0

    t = gradient_linkage(table, c, method='average')

    # Order the tree with respect to the per-feature mean gradient value.
    mean_g = mean_niche_estimator(table, c)
    mean_g = pd.Series(mean_g, index=table.columns)
    mean_g = mean_g.sort_values()
    t = gradient_sort(t, mean_g)

    t = rename_internal_nodes(t)
    return t
def ilr_phylogenetic(table: pd.DataFrame,
                     tree: skbio.TreeNode,
                     pseudocount: float = 0.5) -> (pd.DataFrame,
                                                   skbio.TreeNode):
    """ Applies the ILR transform to a table using a bifurcated tree.

    Parameters
    ----------
    table : pd.DataFrame
       Table passed to `add_pseudocount` and then to `ilr_transform`.
    tree : skbio.TreeNode
       Input tree. It is copied, so the caller's object is not bifurcated
       or renamed by this function.
    pseudocount : float
       Value handed to `add_pseudocount` together with `table`.
       (default=0.5)

    Returns
    -------
    pd.DataFrame
       Result of `ilr_transform` on the pseudocounted table.
    skbio.TreeNode
       The bifurcated copy of `tree` after `rename_internal_nodes`.
    """
    bifurcated = tree.copy()
    bifurcated.bifurcate()
    named_tree = rename_internal_nodes(bifurcated)

    padded_table = add_pseudocount(table, pseudocount)
    balances = ilr_transform(padded_table, named_tree)
    return balances, named_tree
def assign_ids(input_table: pd.DataFrame,
               input_tree: skbio.TreeNode) -> (pd.DataFrame, skbio.TreeNode):
    """ Gives internal nodes unique ids and matches the tree to the table.

    Parameters
    ----------
    input_table : pd.DataFrame
       Table handed to `match_tips` together with the renamed tree.
    input_tree : skbio.TreeNode
       Tree to label. A copy is bifurcated and renamed; the input object
       itself is not modified here.

    Returns
    -------
    pd.DataFrame
       Table returned by `match_tips`.
    skbio.TreeNode
       Tree returned by `match_tips`.
    """
    labelled = input_tree.copy()
    labelled.bifurcate()

    # One id per internal node: level-order position (tips are counted
    # in the position as well) followed by a random UUID.
    node_ids = []
    for position, node in enumerate(labelled.levelorder(include_self=True)):
        if node.is_tip():
            continue
        node_ids.append('%sL-%s' % (position, uuid.uuid4()))

    labelled = rename_internal_nodes(labelled, names=node_ids)
    matched_table, matched_tree = match_tips(input_table, labelled)
    return matched_table, matched_tree
def assign_ids(input_tree: skbio.TreeNode) -> skbio.TreeNode:
    """ Gives every internal node of a bifurcated copy a unique id.

    Parameters
    ----------
    input_tree : skbio.TreeNode
       Tree to label. A copy is bifurcated and renamed; the input object
       itself is not modified here.

    Returns
    -------
    skbio.TreeNode
       Result of `rename_internal_nodes` called with the generated ids.
    """
    working_copy = input_tree.copy()
    working_copy.bifurcate()

    # Ids combine the node's level-order position (tips included in the
    # count) with a random UUID; tips themselves get no id.
    fresh_names = []
    walk = working_copy.levelorder(include_self=True)
    for order, current in enumerate(walk):
        if not current.is_tip():
            fresh_names.append('%sL-%s' % (order, uuid.uuid4()))

    return rename_internal_nodes(working_copy, names=fresh_names)
def ilr_phylogenetic_differential(
        differential: pd.DataFrame, tree: skbio.TreeNode) -> (
            pd.DataFrame, skbio.TreeNode):
    """ Projects differentials onto the balances defined by a tree.

    Parameters
    ----------
    differential : pd.DataFrame
       Table of differentials. It is transposed before being matched
       against the tree tips, so its index is presumably the feature ids.
    tree : skbio.TreeNode
       Input tree. It is copied, so the caller's object is not modified.

    Returns
    -------
    pd.DataFrame
       Balances with one row per internal node of the returned tree and
       one column per column of `differential`. The index is named
       'featureid'.
    skbio.TreeNode
       The bifurcated tree after matching against `differential` and
       renaming of internal nodes; its internal node names are the row
       labels of the returned balances.
    """
    t = tree.copy()
    t.bifurcate()
    diff, _tree = match_tips(differential.T, t)
    _tree = rename_internal_nodes(_tree)

    in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
    basis = _balance_basis(_tree)[0]
    basis = pd.DataFrame(basis.T, index=diff.columns, columns=in_nodes)

    diff_balances = (diff @ basis).T
    diff_balances.index.name = 'featureid'

    # Return the matched and renamed tree rather than the intermediate
    # copy `t`: the balances are labelled with `_tree`'s internal node
    # names, which `t` does not carry. This also agrees with
    # `ilr_phylogenetic_ordination`, which returns `_tree`.
    return diff_balances, _tree
def rank_linkage(r, method='average'):
    r""" Hierarchical clustering on feature ranks.

    A distance matrix is built over the entries of the input vector `r`
    by applying the `euclidean` callable to every pair of rank values,
    and hierarchical clustering is then run on its condensed form. The
    resulting tree can be useful for constructing principal balances.

    Parameters
    ----------
    r : pd.Series
        Continuous vector representing some ordering of the features in X.
        Its index supplies the tip names of the tree.
    method : str
        Clustering method passed to `linkage`.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import rank_linkage
    >>> ranks = pd.Series([1, 2, 4, 5],
    ...                   index=['o1', 'o2', 'o3', 'o4'])
    >>> tree = rank_linkage(ranks)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    condensed = DistanceMatrix.from_iterable(r, euclidean).condensed_form()
    merges = linkage(condensed, method)
    unnamed_tree = TreeNode.from_linkage_matrix(merges, r.index)
    return rename_internal_nodes(unnamed_tree)
def _intersect_of_table_metadata_tree(table, metadata, tree):
    """ Matches tips, features and samples between the table, metadata
    and tree.

    This function returns the features and samples that are contained in
    all 3 objects.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.
    metadata: pd.DataFrame
        Metadata table that contains information about the samples contained
        in the `table` object.  Samples correspond to rows and covariates
        correspond to columns.
    tree : skbio.TreeNode
        Tree object where the leaves correspond to the columns contained in
        the table.

    Returns
    -------
    pd.DataFrame
        Subset of `table` with common row names as `metadata`
        and common columns as `tree.tips()`
    pd.DataFrame
        Subset of `metadata` with common row names as `table`
    skbio.TreeNode
        Subtree of `tree` with common tips as `table`

    Raises
    ------
    ValueError
        If `table` contains zeros or negative values, if the matched tree
        has no internal nodes, or if the matched table has no rows.
    """
    if np.any(table <= 0):
        raise ValueError('Cannot handle zeros or negative values in `table`. '
                         'Use pseudocounts or ``multiplicative_replacement``.')

    _table, _metadata = match(table, metadata)
    _table, _tree = match_tips(_table, tree)

    # One flag per internal node: True when that node has no name yet.
    non_tips_no_name = [(n.name is None)
                        for n in _tree.levelorder()
                        if not n.is_tip()]

    if len(non_tips_no_name) == 0:
        # The trailing space was missing, which produced
        # "afterintersection" in the rendered message.
        raise ValueError('There are no internal nodes in `tree` after '
                         'intersection with `table`.')

    if len(_table.index) == 0:
        # An empty index means no rows (samples) survived the matching;
        # the message previously spoke of "internal nodes in `table`".
        raise ValueError('There are no samples in `table` after '
                         'intersection with `metadata`.')

    # Only rename when at least one internal node is still unnamed.
    if any(non_tips_no_name):
        _tree = rename_internal_nodes(_tree)

    return _table, _metadata, _tree
def proportional_clustering(table: pd.DataFrame) -> skbio.TreeNode:
    """ Builds a tree for features based on proportionality.

    Parameters
    ----------
    table : pd.DataFrame
       Contingency table where rows are samples and columns are features.
       In addition, the table must have strictly nonzero values.

    Returns
    -------
    skbio.TreeNode
       Tree produced by `proportional_linkage`, with its internal nodes
       renamed by `rename_internal_nodes`. It represents the partitioning
       of features with respect to proportionality.
    """
    linkage_tree = proportional_linkage(table)
    return rename_internal_nodes(linkage_tree)
def ilr_phylogenetic_ordination(
        table: pd.DataFrame, tree: skbio.TreeNode,
        pseudocount: float = 0.5,
        top_k_var: int = 10,
        clades: list = None) -> (
            OrdinationResults, skbio.TreeNode, pd.DataFrame):
    """ Computes balances for a set of clades and packs them as an ordination.

    Parameters
    ----------
    table : pd.DataFrame
       Table whose columns are matched against the tips of `tree`.
    tree : skbio.TreeNode
       Input tree. It is copied, so the caller's object is not modified.
    pseudocount : float
       Pseudocount applied to the table before taking logs, in both the
       automatic and the explicit-clade code paths.  (default=0.5)
    top_k_var : int
       When `clades` is not given, the number of highest-variance balances
       to keep.  (default=10)
    clades : list
       Optional. When given, its first element is split on ',' to obtain
       the names of the clades to compute balances for.

    Returns
    -------
    OrdinationResults
       Balances as `samples`, an identity matrix indexed by the chosen
       clades as `features`, the balance variances as `eigvals` and, as
       `proportion_explained`, each chosen clade's variance divided by
       the sum of the variances.
    skbio.TreeNode
       The bifurcated tree after tip matching and internal node renaming.
    pd.DataFrame
       Basis restricted to the chosen clades; its index is named
       'featureid'.
    """
    t = tree.copy()
    t.bifurcate()
    _table, _tree = match_tips(table, t)
    _tree = rename_internal_nodes(_tree)

    if not clades:
        # No clades requested: compute every balance and keep the
        # `top_k_var` with the largest variance.
        in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
        basis = _balance_basis(_tree)[0]
        _table = add_pseudocount(_table, pseudocount)
        basis = pd.DataFrame(basis.T, index=_table.columns, columns=in_nodes)
        balances = np.log(_table) @ basis
        var = balances.var(axis=0).sort_values(ascending=False)
        clades = var.index[:top_k_var]
        balances = balances[clades]
        basis = basis[clades]
    else:
        clades = clades[0].split(',')
        # Forward the caller's pseudocount; it used to be hard-coded to
        # 0.5 here, silently ignoring the argument.
        balances, basis = _fast_ilr(_tree, _table, clades,
                                    pseudocount=pseudocount)
        var = balances.var(axis=0).sort_values(ascending=False)

    balances.index.name = 'sampleid'

    # feature metadata
    eigvals = var
    prop = var[clades] / var.sum()
    balances = OrdinationResults(
        short_method_name='ILR',
        long_method_name='Phylogenetic Isometric Log Ratio Transform',
        samples=balances,
        features=pd.DataFrame(np.eye(len(clades)), index=clades),
        eigvals=eigvals,
        proportion_explained=prop)

    basis.index.name = 'featureid'
    return balances, _tree, basis
def test_rename_internal_nodes_mutable(self):
    """With ``inplace=True`` the tree passed in is itself renamed."""
    expected_newick = "(((a,b)y2,c)y1,d)y0;\n"
    node = TreeNode.read([u"(((a,b)y2, c),d)r;"])

    # The return value is deliberately ignored; only `node` is inspected.
    rename_internal_nodes(node, inplace=True)

    self.assertEqual(str(node), expected_newick)
def test_rename_internal_nodes_names_mismatch(self):
    """Two names for a tree with three internal nodes raise ValueError."""
    too_few_names = ['r', 'abc']
    node = TreeNode.read([u"(((a,b), c),d)r;"])

    with self.assertRaises(ValueError):
        rename_internal_nodes(node, too_few_names)
def test_rename_internal_nodes_names(self):
    """Explicitly supplied names are applied to the internal nodes."""
    source = TreeNode.read([u"(((a,b), c),d)r;"])
    expected = TreeNode.read([u"(((a,b)ab, c)abc,d)r;"])

    supplied_names = ['r', 'abc', 'ab']
    renamed = rename_internal_nodes(source, supplied_names)

    self.assertEqual(str(expected), str(renamed))
def test_rename_internal_nodes(self):
    """Without explicit names the internal nodes become y0, y1, y2."""
    source = TreeNode.read([u"(((a,b), c),d)r;"])
    expected = TreeNode.read([u"(((a,b)y2, c)y1,d)y0;"])

    renamed = rename_internal_nodes(source)

    self.assertEqual(str(expected), str(renamed))
def correlation_linkage(X, method='ward'):
    r""" Hierarchical clustering based on proportionality.

    The hierarchy is built from a pairwise measure between features
    obtained from `variation_matrix`. For two features :math:`x` and
    :math:`y` this measure is described as

    .. math::
        p(x, y) = var (\ln \frac{x}{y})

    If :math:`p(x, y)` is very small, then :math:`x` and :math:`y`
    are said to be highly correlated. A hierarchical clustering is
    then performed using this measure as a distance metric.

    This can be useful for constructing principal balances [1]_.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns. Its columns supply the tip names of the tree.
    method : str
        Clustering method passed to `linkage`.  (default='ward')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    References
    ----------
    .. [1] Pawlowsky-Glahn V, Egozcue JJ, and Tolosana-Delgado R.
       Principal Balances (2011).

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import correlation_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> tree = correlation_linkage(table+0.1)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    condensed = variation_matrix(X).condensed_form()
    merges = linkage(condensed, method=method)
    unnamed_tree = TreeNode.from_linkage_matrix(merges, X.columns)
    return rename_internal_nodes(unnamed_tree)
def test_rename_internal_nodes_warning(self):
    """Renaming this partially pre-named tree emits a Warning."""
    node = TreeNode.read([u"(((a,b)y2, c),d)r;"])

    with self.assertWarns(Warning):
        rename_internal_nodes(node)