Example #1
    def test_varmat1(self):
        X = pd.DataFrame({'x': np.arange(1, 10), 'y': np.arange(2, 11)})
        res = variation_matrix(X)
        exp = DistanceMatrix(
            [[0, 0.032013010420979787 / 2], [0.032013010420979787 / 2, 0]],
            ids=['x', 'y'])
        self.assertEqual(str(res), str(exp))
Example #2
    def test_varmat1(self):
        X = pd.DataFrame({'x': np.arange(1, 10),
                          'y': np.arange(2, 11)})
        res = variation_matrix(X)
        exp = DistanceMatrix([[0, 0.032013010420979787 / 2],
                              [0.032013010420979787 / 2, 0]], ids=['x', 'y'])
        self.assertEqual(str(res), str(exp))
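For reference, the hard-coded constant in the expected matrix of Examples #1 and #2 is simply the population variance of the log-ratio between the two columns, and the test expects variation_matrix to return half of it. A minimal sketch reproducing the number (NumPy only, independent of gneiss):

import numpy as np

x = np.arange(1, 10)
y = np.arange(2, 11)

# Population variance (ddof=0) of the log-ratio of the two columns.
v = np.var(np.log(x / y))
print(v)      # ~0.032013010420979787, the constant hard-coded above
print(v / 2)  # the off-diagonal entry of the expected DistanceMatrix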
Example #3
    def test_varmat_larg(self):
        np.random.seed(123)
        D = 50
        N = 100
        mean = np.ones(D) * 10
        cov = np.eye(D)
        n__ = np.random.multivariate_normal(mean, cov, size=N)
        X = pd.DataFrame(np.abs(n__), columns=np.arange(D).astype(str))
        res = variation_matrix(X)

        exp = DistanceMatrix.read(get_data_path('exp_varmat.txt'))
        self.assertEqual(str(res), str(exp))
Example #4
    def test_varmat_larg(self):
        np.random.seed(123)
        D = 50
        N = 100
        mean = np.ones(D)*10
        cov = np.eye(D)
        X = pd.DataFrame(np.abs(np.random.multivariate_normal(mean, cov,
                                                              size=N)),
                         columns=np.arange(D).astype(str))
        res = variation_matrix(X)

        exp = DistanceMatrix.read(get_data_path('exp_varmat.txt'))
        self.assertEqual(str(res), str(exp))
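Examples #3 and #4 only compare against a matrix stored in exp_varmat.txt. For small tables, a brute-force reference can be built directly from the var(ln x/y) / 2 convention implied by the first two tests; this is only a sketch for sanity checks, not the gneiss implementation:

import numpy as np
import pandas as pd
from skbio import DistanceMatrix

def naive_variation_matrix(X):
    """Entry (i, j) is var(ln(X[:, i] / X[:, j])) / 2; assumes strictly positive values."""
    logX = np.log(X.values)
    d = X.shape[1]
    out = np.zeros((d, d))
    for i in range(d):
        for j in range(i + 1, d):
            out[i, j] = out[j, i] = np.var(logX[:, i] - logX[:, j]) / 2
    return DistanceMatrix(out, ids=[str(c) for c in X.columns])

On the two-column table from Example #1 this should reproduce the hand-written exp matrix.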
Example #5
def proportional_linkage(X, method='ward'):
    r"""
    Principal Balance Analysis using Hierarchical Clustering
    based on proportionality.

    The hierarchy is built based on the proportionality between
    any pair of features.  Specifically, the proportionality between
    two features :math:`x` and :math:`y` is measured by

    .. math::
        p(x, y) = var (\ln \frac{x}{y})

    If :math:`p(x, y)` is very small, then :math:`x` and :math:`y`
    are said to be highly proportional. A hierarchical clustering is
    then performed using this proportionality as a distance metric.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    method : str
        Clustering method.  (default='ward')

    Returns
    -------
    skbio.TreeNode
        Tree generated from principal balance analysis.

    References
    ----------

    .. [1] Pawlowsky-Glahn V, Egozcue JJ, and Tolosana-Delgado R.
       Principal Balances (2011).

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import proportional_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> tree = proportional_linkage(table+0.1)

    """
    dm = variation_matrix(X)
    lm = linkage(dm.condensed_form(), method=method)
    return TreeNode.from_linkage_matrix(lm, X.columns)
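The body above is a three-step pipeline: a variation matrix, a SciPy linkage, and a tree built from the linkage matrix. A hedged step-by-step version on the docstring's toy table, reusing the naive_variation_matrix sketch from Example #4 in place of gneiss's own variation_matrix, makes the intermediate objects visible:

import pandas as pd
from scipy.cluster.hierarchy import linkage
from skbio import TreeNode

table = pd.DataFrame([[1, 1, 0, 0, 0],
                      [0, 1, 1, 0, 0],
                      [0, 0, 1, 1, 0],
                      [0, 0, 0, 1, 1]],
                     columns=['s1', 's2', 's3', 's4', 's5'],
                     index=['o1', 'o2', 'o3', 'o4']).T

dm = naive_variation_matrix(table + 0.1)   # pairwise var(ln x/y) / 2 distances
print(dm.condensed_form())                 # the vector handed to SciPy's linkage
lm = linkage(dm.condensed_form(), method='ward')
tree = TreeNode.from_linkage_matrix(lm, table.columns)
print(tree.ascii_art())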
Example #6
def correlation_linkage(X, method='ward'):
    r"""
    Hierarchical Clustering based on proportionality.

    The hierarchy is built based on the correlation between
    any pair of features.  Specifically, the correlation between
    two features :math:`x` and :math:`y` is measured by

    .. math::
        p(x, y) = var (\ln \frac{x}{y})

    If :math:`p(x, y)` is very small, then :math:`x` and :math:`y`
    are said to be highly correlated. A hierarchical clustering is
    then performed using this correlation as a distance metric.

    This can be useful for constructing principal balances [1]_.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    method : str
        Clustering method.  (default='ward')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    References
    ----------

    .. [1] Pawlowsky-Glahn V, Egozcue JJ, and Tolosana-Delgado R.
       Principal Balances (2011).

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import correlation_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> tree = correlation_linkage(table+0.1)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    dm = variation_matrix(X)
    lm = linkage(dm.condensed_form(), method=method)
    t = TreeNode.from_linkage_matrix(lm, X.columns)
    t = rename_internal_nodes(t)
    return t
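The only difference from proportional_linkage in Example #5 is the trailing rename_internal_nodes call, which gives the internal nodes the y0, y1, ... labels visible in the ascii art above, while tips keep the original feature names. A short sketch walking the returned tree, reusing the docstring's table and import:

import pandas as pd
from gneiss.cluster import correlation_linkage

table = pd.DataFrame([[1, 1, 0, 0, 0],
                      [0, 1, 1, 0, 0],
                      [0, 0, 1, 1, 0],
                      [0, 0, 0, 1, 1]],
                     columns=['s1', 's2', 's3', 's4', 's5'],
                     index=['o1', 'o2', 'o3', 'o4']).T

tree = correlation_linkage(table + 0.1)
for node in tree.postorder():
    # Tips keep the feature names (o1..o4); internal nodes carry the y-labels
    # assigned by rename_internal_nodes.
    print('tip' if node.is_tip() else 'internal', node.name)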