def test_mean_niche_estimator_missing(self):
    """A NaN in the gradient must raise ValueError."""
    samples = ['s1', 's2', 's3', 's4', 's5']
    gradient = pd.Series([1, 2, 3, 4, np.nan], index=samples)
    values = pd.Series([1, 3, 0, 0, 0], index=samples)
    with self.assertRaises(ValueError):
        mean_niche_estimator(values, gradient)
def test_mean_niche_estimator_bad_length(self):
    """Mismatched lengths of values and gradient must raise ValueError."""
    gradient = pd.Series([1, 2, 3, 4, 5],
                         index=['s1', 's2', 's3', 's4', 's5'])
    values = pd.Series([1, 3, 0, 0, 0, 0],
                       index=['s1', 's2', 's3', 's4', 's5', 's6'])
    with self.assertRaises(ValueError):
        mean_niche_estimator(values, gradient)
def test_mean_niche_estimator_missing(self):
    """Estimator should reject a gradient that contains NaN."""
    idx = ['s1', 's2', 's3', 's4', 's5']
    bad_gradient = pd.Series([1, 2, 3, 4, np.nan], index=idx)
    abundances = pd.Series([1, 3, 0, 0, 0], index=idx)
    with self.assertRaises(ValueError):
        mean_niche_estimator(abundances, bad_gradient)
def test_mean_niche_estimator_bad_length(self):
    """Estimator should reject inputs of different lengths."""
    short_idx = ['s1', 's2', 's3', 's4', 's5']
    long_idx = short_idx + ['s6']
    gradient = pd.Series([1, 2, 3, 4, 5], index=short_idx)
    values = pd.Series([1, 3, 0, 0, 0, 0], index=long_idx)
    with self.assertRaises(ValueError):
        mean_niche_estimator(values, gradient)
def gradient_clustering(table: pd.DataFrame,
                        gradient: NumericMetadataColumn,
                        weighted: bool = True) -> skbio.TreeNode:
    """ Builds a tree for features based on a gradient.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where rows are samples and columns are features.
    gradient : qiime2.NumericMetadataColumn
        Continuous vector of measurements corresponding to samples.
    weighted : bool
        Specifies if abundance or presence/absence information
        should be used to perform the clustering.

    Returns
    -------
    skbio.TreeNode
        Represents the partitioning of features with respect to the gradient.
    """
    c = gradient.to_series()
    if not weighted:
        # Binarize to presence/absence.  Use the builtin ``float`` here:
        # ``np.float`` was deprecated in NumPy 1.20 and removed in 1.24,
        # and the builtin is what it aliased anyway.
        table = (table > 0).astype(float)
    # Align samples shared by the table and the gradient.
    table, c = match(table, c)
    t = gradient_linkage(table, c, method='average')
    # Sort tips by each feature's mean position along the gradient so the
    # tree reads left-to-right with the gradient.
    mean_g = mean_niche_estimator(table, c)
    mean_g = pd.Series(mean_g, index=table.columns)
    mean_g = mean_g.sort_values()
    t = gradient_sort(t, mean_g)
    return t
def test_mean_niche_estimator2(self):
    """Weighted mean gradient position of a simple feature vector."""
    samples = ['s1', 's2', 's3', 's4', 's5']
    gradient = pd.Series([1, 2, 3, 4, 5], index=samples)
    values = pd.Series([1, 3, 0, 0, 0], index=samples)
    self.assertEqual(mean_niche_estimator(values, gradient), 1.75)
def test_mean_niche_estimator2(self):
    """Mean niche of (1, 3, 0, 0, 0) over gradient 1..5 is 1.75."""
    idx = ['s1', 's2', 's3', 's4', 's5']
    gradient = pd.Series([1, 2, 3, 4, 5], index=idx)
    abundances = pd.Series([1, 3, 0, 0, 0], index=idx)
    result = mean_niche_estimator(abundances, gradient)
    self.assertEqual(result, 1.75)
def test_mean_niche_estimator_frame(self):
    """DataFrame input yields one mean niche estimate per column."""
    samples = ['s1', 's2', 's3', 's4', 's5']
    gradient = pd.Series([1, 2, 3, 4, 5], index=samples)
    counts = np.array([[1, 3, 0, 0, 0],
                       [1, 3, 0, 0, 0]]).T
    values = pd.DataFrame(counts, index=samples, columns=['o1', 'o2'])
    obs = mean_niche_estimator(values, gradient)
    exp = pd.Series([1.75, 1.75], index=['o1', 'o2'])
    pdt.assert_series_equal(obs, exp)
def test_mean_niche_estimator_frame(self):
    """Two identical feature columns get identical niche estimates."""
    idx = ['s1', 's2', 's3', 's4', 's5']
    gradient = pd.Series([1, 2, 3, 4, 5], index=idx)
    col = [1, 3, 0, 0, 0]
    values = pd.DataFrame(np.array([col, col]).T,
                          index=idx, columns=['o1', 'o2'])
    expected = pd.Series([1.75, 1.75], index=['o1', 'o2'])
    observed = mean_niche_estimator(values, gradient)
    pdt.assert_series_equal(observed, expected)
def gradient_linkage(X, y, method='average'):
    # NOTE: the docstring below must be a raw string — it contains LaTeX
    # commands such as ``\frac``, and in a plain string ``\f`` is a form-feed
    # escape that silently corrupts the rendered math.
    r""" Principal Balance Analysis using Hierarchical Clustering
    on known gradient.

    The hierarchy is built based on the values of the samples located along
    a gradient.  Given a feature :math:`x`, the mean gradient values that
    :math:`x` was observed in is calculated by

    .. math::
        f(g , x) =
         \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}

    Where :math:`N` is the number of samples, :math:`x_i` is the proportion
    of feature :math:`x` in sample :math:`i`, :math:`g_i` is the gradient
    value at sample `i`.

    The distance between two features :math:`x` and :math:`y` can be defined
    as

    .. math::
        d(x, y) = (f(g, x) - f(g, y))^2

    If :math:`d(x, y)` is very small, then :math:`x` and :math:`y` are
    expected to live in very similar positions across the gradient.
    A hierarchical clustering is then performed using :math:`d(x, y)` as
    the distance metric.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    y : pd.Series
        Continuous vector representing some ordering of the samples in X.
    method : str
        Clustering method.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree generated from principal balance analysis.

    See Also
    --------
    mean_niche_estimator
    """
    _X, _y = match(X, y)
    mean_X = mean_niche_estimator(_X, gradient=_y)
    dm = DistanceMatrix.from_iterable(mean_X, euclidean)
    lm = linkage(dm.condensed_form(), method)
    return TreeNode.from_linkage_matrix(lm, X.columns)
def gradient_clustering(table: pd.DataFrame,
                        gradient: NumericMetadataColumn,
                        ignore_missing_samples: bool = False,
                        weighted: bool = True) -> skbio.TreeNode:
    """ Builds a tree for features based on a gradient.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where rows are samples and columns are features.
    gradient : qiime2.NumericMetadataColumn
        Continuous vector of measurements corresponding to samples.
    ignore_missing_samples: bool
        Whether to except or ignore when there are samples present in the
        table that are not present in the gradient metadata.
    weighted : bool
        Specifies if abundance or presence/absence information
        should be used to perform the clustering.

    Returns
    -------
    skbio.TreeNode
        Represents the partitioning of features with respect to the gradient.
    """
    series = gradient.to_series()
    # Samples in the table but absent from the metadata column.
    missing = set(table.index) - set(series.index)
    if missing and not ignore_missing_samples:
        raise KeyError("There are samples present in the table not "
                       "present in the gradient metadata column. Override "
                       "this error by using the `ignore_missing_samples` "
                       "argument. Offending samples: %r"
                       % ', '.join(sorted([str(i) for i in missing])))
    if not weighted:
        # Presence/absence clustering: binarize the table.
        table = (table > 0).astype(float)
    table, series = match(table, series)
    tree = gradient_linkage(table, series, method='average')
    # Order tips by each feature's mean location along the gradient.
    niche_means = pd.Series(mean_niche_estimator(table, series),
                            index=table.columns).sort_values()
    return gradient_sort(tree, niche_means)
def gradient_linkage(X, y, method='average'):
    r""" Hierarchical Clustering on known gradient.

    The hierarchy is built based on the values of the samples located along
    a gradient.  Given a feature :math:`x`, the mean gradient values that
    :math:`x` was observed in is calculated by

    .. math::
        f(g , x) =
         \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}

    Where :math:`N` is the number of samples, :math:`x_i` is the proportion
    of feature :math:`x` in sample :math:`i`, :math:`g_i` is the gradient
    value at sample `i`.

    The distance between two features :math:`x` and :math:`y` can be
    defined as

    .. math::
        d(x, y) = (f(g, x) - f(g, y))^2

    If :math:`d(x, y)` is very small, then :math:`x` and :math:`y` are
    expected to live in very similar positions across the gradient.
    A hierarchical clustering is then performed using :math:`d(x, y)` as
    the distance metric.

    This can be useful for constructing principal balances.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    y : pd.Series
        Continuous vector representing some ordering of the samples in X.
    method : str
        Clustering method.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    See Also
    --------
    mean_niche_estimator

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import gradient_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> gradient = pd.Series([1, 2, 3, 4, 5],
    ...                      index=['s1', 's2', 's3', 's4', 's5'])
    >>> tree = gradient_linkage(table, gradient)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    # Align the table with the gradient, estimate each feature's mean
    # position along the gradient, then cluster on those rank positions.
    matched_table, matched_gradient = match(X, y)
    niche_means = mean_niche_estimator(matched_table,
                                       gradient=matched_gradient)
    return rank_linkage(niche_means)