def run_test(
    X1,
    X2,
    rows=None,
    info={},
    auto=auto,
    n_bootstraps=n_bootstraps,
    workers=workers,
    test=test,
    print_out=False,
):
    """Time a single ``KSample`` test and record the outcome as a dict.

    Parameters
    ----------
    X1, X2
        The two samples handed to ``KSample(test).test``.
    rows : list or None
        When given, the result row is appended to it and nothing is returned.
    info : dict
        Extra key/value pairs merged into the result row; entries here
        override the computed ones on key collision. Only read, never mutated.
    auto, n_bootstraps, workers, test
        Forwarded to the test (``n_bootstraps`` is passed as ``reps``). Their
        defaults are taken from same-named variables of the enclosing scope
        at definition time.
    print_out : bool
        Pretty-print the row when true.

    Returns
    -------
    dict or None
        The row (``pvalue``, ``tstat``, ``elapsed`` plus ``info``) when
        ``rows`` is None, otherwise None.
    """
    started = time.time()
    runner = KSample(test)
    tstat, pvalue = runner.test(
        X1, X2, reps=n_bootstraps, workers=workers, auto=auto
    )
    # wall-clock seconds, covering construction of the test object as well
    elapsed = time.time() - started

    row = dict(pvalue=pvalue, tstat=tstat, elapsed=elapsed)
    row.update(info)

    if print_out:
        pprint.pprint(row)

    if rows is None:
        return row
    rows.append(row)
    return None
    def fit(self, A1, A2):
        """
        Run the two-sample test on a pair of graphs or on their embeddings.

        Parameters
        ----------
        A1, A2 : variable (see description)
            The two graphs, or their embeddings, to compare.
            What is expected depends on the ``input_graph`` attribute:

            - input_graph=True
                the inputs are passed through ``import_graph`` and then
                embedded with ``self._embed``.
            - input_graph=False
                the inputs must already be embeddings, given as np.ndarrays
                of shape (n, d) and (m, d) with a shared number of
                components d.

            The two inputs need not have the same number of vertices.

        Returns
        -------
        self
            With ``metric_func_``, ``null_distribution_``,
            ``sample_T_statistic_`` and ``p_value_`` set.

        Raises
        ------
        TypeError
            If ``input_graph`` is False and an input is not an np.ndarray.
        ValueError
            If ``input_graph`` is False and an input is not two-dimensional,
            or the two inputs differ in their second dimension.
        """
        if not self.input_graph:
            # Shared tails of the error messages below.
            type_hint = ("If input_graph is False, the inputs need to be "
                         "adjacency spectral embeddings, with shapes (n, d) and "
                         "(m, d), passed as np.ndarrays.")
            shape_hint = ("If input_graph is False, the inputs need to be adjacency "
                          "spectral embeddings, with shapes (n, d) and (m, d)")

            # networkx graphs can be cast to arrays, which is not wanted here,
            # so anything that is not already an ndarray is rejected
            if not isinstance(A1, np.ndarray):
                raise TypeError(
                    f"Embedding of the first graph is of type {type(A1)}, not "
                    "np.ndarray. " + type_hint)
            if not isinstance(A2, np.ndarray):
                raise TypeError(
                    f"Embedding of the second graph is of type {type(A2)}, not an "
                    "array. " + type_hint)

            if A1.ndim != 2:
                raise ValueError(
                    "Embedding array of the first graph does not have two dimensions. "
                    + shape_hint)
            if A2.ndim != 2:
                raise ValueError(
                    "Embedding array of the second graph does not have two dimensions. "
                    + shape_hint)
            if A1.shape[1] != A2.shape[1]:
                raise ValueError(
                    "Two input embeddings have different number of components. "
                    + shape_hint)

            # checking for inf values
            X1_hat = check_array(A1)
            X2_hat = check_array(A2)
        else:
            A1 = import_graph(A1)
            A2 = import_graph(A2)

            X1_hat, X2_hat = self._embed(A1, A2)

        # Pick the aligner (if any) first, then apply it in a single place.
        aligner = None
        if self.align_type == "sign_flips":
            aligner = SignFlips(**self.align_kws)
        elif self.align_type == "seedless_procrustes":
            aligner = SeedlessProcrustes(**self.align_kws)
        if aligner is not None:
            X1_hat = aligner.fit_transform(X1_hat, X2_hat)

        if self.size_correction:
            X1_hat, X2_hat = self._sample_modified_ase(
                X1_hat, X2_hat, pooled=self.pooled)

        self.metric_func_ = self._instantiate_metric_func(
            self.metric, self.test)
        ksample = KSample(self.test, compute_distance=self.metric_func_)

        outcome = ksample.test(
            X1_hat,
            X2_hat,
            reps=self.n_bootstraps,
            workers=self.workers,
            auto=False,
        )

        # outcome[0] is the statistic, outcome[1] the p-value
        self.null_distribution_ = ksample.indep_test.null_dist
        self.sample_T_statistic_ = outcome[0]
        self.p_value_ = outcome[1]

        return self
예제 #3
0
def latent_distribution_test(
    A1,
    A2,
    test="dcorr",
    metric="euclidean",
    n_components=None,
    n_bootstraps=500,
    workers=1,
    size_correction=True,
    pooled=False,
    align_type="sign_flips",
    align_kws={},
    input_graph=True,
):
    """Two-sample hypothesis test for the problem of determining whether two random
    dot product graphs have the same distributions of latent positions.

    This test can operate on two graphs where there is no known matching
    between the vertices of the two graphs, or even when the number of vertices
    is different. Currently, testing is only supported for undirected graphs.

    Read more in the :ref:`tutorials <inference_tutorials>`

    Parameters
    ----------
    A1, A2 : variable (see description of 'input_graph')
        The two graphs, or their embeddings to run a hypothesis test on.
        Expected variable type and shape depends on input_graph attribute

    test : str (default="dcorr")
        Backend hypothesis test to use, one of ["cca", "dcorr", "hhg", "rv", "hsic", "mgc"].
        These tests are typically used for independence testing, but here they
        are used for a two-sample hypothesis test on the latent positions of
        two graphs. See :class:`hyppo.ksample.KSample` for more information.

    metric : str or function (default="euclidean")
        Distance or a kernel metric to use, either a callable or a valid string.
        If a callable, then it should behave similarly to either
        :func:`sklearn.metrics.pairwise_distances` or to
        :func:`sklearn.metrics.pairwise.pairwise_kernels`.
        If a string, then it should be either one of the keys in
        :py:attr:`sklearn.metrics.pairwise.PAIRED_DISTANCES`, one of the keys in
        :py:attr:`sklearn.metrics.pairwise.PAIRWISE_KERNEL_FUNCTIONS`, or "gaussian",
        which will use a gaussian kernel with an adaptively selected bandwidth.
        It is recommended to use kernels (e.g. "gaussian") with kernel-based
        hsic test and distances (e.g. "euclidean") with all other tests.

    n_components : int or None (default=None)
        Number of embedding dimensions. If None, the optimal embedding
        dimensions are found by the Zhu and Ghodsi algorithm.
        See :func:`~graspologic.embed.selectSVD` for more information.
        This argument is ignored if ``input_graph`` is False.

    n_bootstraps : int (default=500)
        Number of bootstrap iterations for the backend hypothesis test.
        See :class:`hyppo.ksample.KSample` for more information.

    workers : int (default=1)
        Number of workers to use. If more than 1, parallelizes the code.
        Supply -1 to use all cores available to the Process.

    size_correction : bool (default=True)
        Ignored when the two graphs have the same number of vertices. The test
        degrades in validity as the number of vertices of the two graphs
        diverge from each other, unless a correction is performed.

        - True
            Whenever the two graphs have different numbers of vertices,
            estimates the plug-in estimator for the variance and uses it to
            correct the embedding of the larger graph.
        - False
            Does not perform any modifications (not recommended).

    pooled : bool (default=False)
        Ignored whenever the two graphs have the same number of vertices or
        ``size_correction`` is set to False. In order to correct the adjacency
        spectral embedding used in the test, it is needed to estimate the
        variance for each of the latent position estimates in the larger graph,
        which requires to compute different sample moments. These moments can
        be computed either over the larger graph (False), or over both graphs
        (True). Setting it to True should not affect the behavior of the test
        under the null hypothesis, but it is not clear whether it has more
        power or less power under which alternatives. Generally not recommended,
        as it is untested and included for experimental purposes.

    align_type : str, {'sign_flips' (default), 'seedless_procrustes'} or None
        Random dot product graphs have an inherent non-identifiability,
        associated with their latent positions. Thus, two embeddings of
        different graphs may not be orthogonally aligned. Without this accounted
        for, two embeddings of different graphs may appear different, even
        if the distributions of the true latent positions are the same.
        There are several options in terms of how this can be addressed:

        - 'sign_flips'
            A simple heuristic that flips the signs of one of the embeddings,
            if the medians of the two embeddings in that dimension differ from
            each other. See :class:`graspologic.align.SignFlips` for more
            information on this procedure. In the limit, this is guaranteed to
            lead to a valid test, as long as matrix :math:`X^T X`, where
            :math:`X` is the latent positions does not have repeated non-zero
            eigenvalues. This may, however, result in an invalid test in the
            finite sample case if some of the eigenvalues are the same or close.
        - 'seedless_procrustes'
            An algorithm that learns an orthogonal alignment matrix. This
            procedure is slower than sign flips, but is guaranteed to yield a
            valid test in the limit, and also makes the test more valid in some
            finite sample cases, in which the eigenvalues are very close to
            each other. See :class:`graspologic.align.SeedlessProcrustes` for
            more information on the procedure.
        - None
            Do not use any alignment technique. This is strongly not
            recommended, as it may often result in a test that is not valid.

    align_kws : dict (default={})
        Keyword arguments for the aligner of choice, either
        :class:`graspologic.align.SignFlips` or
        :class:`graspologic.align.SeedlessProcrustes`, depending on the ``align_type``.
        See respective classes for more information. Only unpacked, never
        mutated.

    input_graph : bool (default=True)
        Flag whether to expect two full graphs, or the embeddings.

        - True
            This function expects graphs, either as NetworkX graph objects
            or as adjacency matrices, provided as ndarrays of size (n, n) and
            (m, m). They will be embedded using adjacency spectral embeddings.
        - False
            This function expects adjacency spectral embeddings of the graphs,
            they must be ndarrays of size (n, d) and (m, d), where
            d must be same. n_components attribute is ignored in this case.

    Returns
    -------
    The three values below, packaged with ``ldt_result`` in this order.

    p_value : float
        The overall p value from the test.

    sample_T_statistic : float
        The observed difference between the embedded latent positions of the
        two input graphs.

    misc_stats : dictionary
        A collection of other statistics obtained from the latent position test

        - null_distribution : ndarray, shape (n_bootstraps,)
            The distribution of T statistics generated under the null.

        - n_components : int or None
            The ``n_components`` argument exactly as it was passed in.

        - Q : array, size (d, d)
            Final orthogonal matrix, used to modify ``X``. The identity
            matrix when ``align_type`` is None.

    Raises
    ------
    TypeError
        If an argument has the wrong type, or if ``input_graph`` is False and
        an input is not an np.ndarray.
    ValueError
        If ``test`` or ``align_type`` is not a supported value, if
        ``n_bootstraps`` is negative, or if ``input_graph`` is False and the
        embeddings are not two-dimensional or differ in their number of
        components.

    References
    ----------
    .. [1] Tang, M., Athreya, A., Sussman, D. L., Lyzinski, V., & Priebe, C. E. (2017).
        "A nonparametric two-sample hypothesis testing problem for random graphs."
        Bernoulli, 23(3), 1599-1630.

    .. [2] Panda, S., Palaniappan, S., Xiong, J., Bridgeford, E., Mehta, R., Shen, C., & Vogelstein, J. (2019).
        "hyppo: A Comprehensive Multivariate Hypothesis Testing Python Package."
        arXiv:1907.02088.

    .. [3] Alyakin, A. A., Agterberg, J., Helm, H. S., Priebe, C. E. (2020).
       "Correcting a Nonparametric Two-sample Graph Hypothesis Test for Graphs with Different Numbers of Vertices"
       arXiv:2008.09434

    """

    # check test argument
    if not isinstance(test, str):
        msg = "test must be a str, not {}".format(type(test))
        raise TypeError(msg)
    elif test not in _VALID_TESTS:
        msg = "Unknown test {}. Valid tests are {}".format(test, _VALID_TESTS)
        raise ValueError(msg)
    # metric argument is checked when metric_func_ is instantiated
    # check n_components argument
    if n_components is not None:
        if not isinstance(n_components, int):
            msg = "n_components must be an int, not {}.".format(
                type(n_components))
            raise TypeError(msg)
    # check n_bootstraps argument
    if not isinstance(n_bootstraps, int):
        msg = "n_bootstraps must be an int, not {}".format(type(n_bootstraps))
        raise TypeError(msg)
    elif n_bootstraps < 0:
        msg = "{} is invalid number of bootstraps, must be non-negative"
        raise ValueError(msg.format(n_bootstraps))
    # check workers argument
    if not isinstance(workers, int):
        msg = "workers must be an int, not {}".format(type(workers))
        raise TypeError(msg)
    # check size_correction argument
    if not isinstance(size_correction, bool):
        msg = "size_correction must be a bool, not {}".format(
            type(size_correction))
        raise TypeError(msg)
    # check pooled argument
    if not isinstance(pooled, bool):
        msg = "pooled must be a bool, not {}".format(type(pooled))
        raise TypeError(msg)
    # check align_type argument
    if (not isinstance(align_type, str)) and (align_type is not None):
        msg = "align_type must be a string or None, not {}".format(
            type(align_type))
        raise TypeError(msg)
    align_types_supported = ["sign_flips", "seedless_procrustes", None]
    if align_type not in align_types_supported:
        msg = "supported align types are {}".format(align_types_supported)
        raise ValueError(msg)
    # check align_kws argument
    if not isinstance(align_kws, dict):
        msg = "align_kws must be a dictionary of keyword arguments, not {}".format(
            type(align_kws))
        raise TypeError(msg)
    # check input_graph argument
    if not isinstance(input_graph, bool):
        msg = "input_graph must be a bool, not {}".format(type(input_graph))
        raise TypeError(msg)

    if input_graph:
        A1 = import_graph(A1)
        A2 = import_graph(A2)

        X1_hat, X2_hat = _embed(A1, A2, n_components)
    else:
        # check for nx objects, since they are castable to arrays,
        # but we don't want that
        if not isinstance(A1, np.ndarray):
            msg = (
                f"Embedding of the first graph is of type {type(A1)}, not "
                "np.ndarray. If input_graph is False, the inputs need to be "
                "adjacency spectral embeddings, with shapes (n, d) and "
                "(m, d), passed as np.ndarrays.")
            raise TypeError(msg)
        if not isinstance(A2, np.ndarray):
            msg = (
                f"Embedding of the second graph is of type {type(A2)}, not an "
                "array. If input_graph is False, the inputs need to be "
                "adjacency spectral embeddings, with shapes (n, d) and "
                "(m, d), passed as np.ndarrays.")
            raise TypeError(msg)

        if A1.ndim != 2:
            msg = (
                "Embedding array of the first graph does not have two dimensions. "
                "If input_graph is False, the inputs need to be adjacency "
                "spectral embeddings, with shapes (n, d) and (m, d)")
            raise ValueError(msg)
        if A2.ndim != 2:
            msg = (
                "Embedding array of the second graph does not have two dimensions. "
                "If input_graph is False, the inputs need to be adjacency "
                "spectral embeddings, with shapes (n, d) and (m, d)")
            raise ValueError(msg)
        if A1.shape[1] != A2.shape[1]:
            msg = ("Two input embeddings have different number of components. "
                   "If input_graph is False, the inputs need to be adjacency "
                   "spectral embeddings, with shapes (n, d) and (m, d)")
            raise ValueError(msg)

        # checking for inf values
        X1_hat = check_array(A1)
        X2_hat = check_array(A2)

    if align_type == "sign_flips":
        aligner = SignFlips(**align_kws)
        X1_hat = aligner.fit_transform(X1_hat, X2_hat)
        Q = aligner.Q_
    elif align_type == "seedless_procrustes":
        aligner = SeedlessProcrustes(**align_kws)
        X1_hat = aligner.fit_transform(X1_hat, X2_hat)
        Q = aligner.Q_
    else:
        # No alignment: report the identity over the d embedding dimensions
        # (columns of the embedding), so Q is (d, d) as documented rather
        # than (n, n) over the vertices.
        Q = np.identity(X1_hat.shape[1])

    if size_correction:
        X1_hat, X2_hat = _sample_modified_ase(X1_hat, X2_hat, pooled=pooled)

    metric_func_ = _instantiate_metric_func(metric=metric, test=test)
    test_obj = KSample(test, compute_distance=metric_func_)

    # auto=False is passed explicitly to the backend test on every call
    data = test_obj.test(X1_hat,
                         X2_hat,
                         reps=n_bootstraps,
                         workers=workers,
                         auto=False)

    null_distribution = test_obj.indep_test.null_dist

    misc_stats = {
        "null_distribution": null_distribution,
        "n_components": n_components,
        "Q": Q,
    }
    # data[0] is the test statistic, data[1] the p-value
    sample_T_statistic = data[0]
    p_value = data[1]

    return ldt_result(p_value, sample_T_statistic, misc_stats)
예제 #4
0
# Apply seaborn's global plot styling (white style, "talk" context).
sns.set(color_codes=True, style="white", context="talk", font_scale=1)

# Visualize the simulation: draw every sample on one square figure, using its
# first column as x and its second column as y.
# NOTE(review): assumes ``sims`` is a sequence of 2-D arrays defined earlier in
# the script -- confirm.
plt.figure(figsize=(5, 5))
for sim in sims:
    plt.scatter(sim[:, 0], sim[:, 1])
# Hide the tick marks and the left, bottom and right spines.
plt.xticks([])
plt.yticks([])
sns.despine(left=True, bottom=True, right=True)
plt.show()

# Run the k-sample test (Dcorr as the underlying independence test) on the
# simulated data. ``*sims`` unpacks the sequence so that each sample is passed
# as its own positional argument.
stat, pvalue = KSample(indep_test="Dcorr").test(*sims)
print(stat, pvalue)

########################################################################################
# This was a general use case for the test, but there are a number of intricacies that
# depend on the type of independence test chosen. Those same parameters can be modified
# in this class. For a full list of the parameters, see the desired test in
# :mod:`hyppo.independence` and for examples on how to use it, see :ref:`indep`.

########################################################################################
# Distance (and Kernel) Equivalencies
# --------------------------------------------
#
# It turns out that a number of test statistics are multiples of one another and so,
# their p-values are equivalent to the above :ref:`nonpar manova`. `[1]`_ goes through
# the distance and kernel equivalencies and `[2]`_ goes through the independence and
예제 #5
0
    def __init__(
        self,
        test="dcorr",
        metric="euclidean",
        n_components=None,
        n_bootstraps=200,
        workers=1,
        size_correction=True,
    ):
        """
        Validate the settings and build the backend ``KSample`` test object.

        Parameters
        ----------
        test : str (default="dcorr")
            Name of the backend test; must be a member of ``_VALID_TESTS``.
        metric : str or callable (default="euclidean")
            Either a member of ``_VALID_METRICS`` or a callable, which is
            used as the distance/kernel function unchanged.
        n_components : int or None (default=None)
            Forwarded to the parent class initializer.
        n_bootstraps : int (default=200)
            Must be non-negative.
        workers : int (default=1)
        size_correction : bool (default=True)

        Raises
        ------
        TypeError
            If any argument has the wrong type.
        ValueError
            If ``test`` or ``metric`` is unknown, or ``n_bootstraps`` is
            negative.

        Warns
        -----
        UserWarning
            If a distance metric is combined with the "hsic" test, or a
            kernel metric with any other test.
        """
        # --- argument validation -------------------------------------------
        if not isinstance(test, str):
            raise TypeError("test must be a str, not {}".format(type(test)))
        if test not in _VALID_TESTS:
            raise ValueError(
                "Unknown test {}. Valid tests are {}".format(test, _VALID_TESTS))

        if not (isinstance(metric, str) or callable(metric)):
            raise TypeError(
                "Metric must be str or callable, not {}".format(type(metric)))
        if metric not in _VALID_METRICS and not callable(metric):
            raise ValueError(
                "Unknown metric {}. Valid metrics are {}, or a callable".format(
                    metric, _VALID_METRICS))

        if n_components is not None and not isinstance(n_components, int):
            raise TypeError(
                "n_components must be an int, not {}.".format(type(n_components)))

        if not isinstance(n_bootstraps, int):
            raise TypeError(
                "n_bootstraps must be an int, not {}".format(type(n_bootstraps)))
        if n_bootstraps < 0:
            raise ValueError(
                "{} is invalid number of bootstraps, must be non-negative".format(
                    n_bootstraps))

        if not isinstance(workers, int):
            raise TypeError(
                "workers must be an int, not {}".format(type(workers)))

        if not isinstance(size_correction, bool):
            raise TypeError(
                "size_correction must be a bool, not {}".format(
                    type(size_correction)))

        super().__init__(n_components=n_components)

        # --- resolve the metric into a callable ------------------------------
        if callable(metric):
            metric_func = metric
        elif metric in _VALID_DISTANCES:
            if test == "hsic":
                warnings.warn(
                    f"{test} is a kernel-based test, but {metric} "
                    "is a distance. results may not be optimal. it is "
                    "recomended to use either a different test or one of "
                    f"the kernels: {_VALID_KERNELS} as a metric.",
                    UserWarning,
                )

            def metric_func(X, Y=None, metric=metric, workers=None):
                return pairwise_distances(X, Y, metric=metric, n_jobs=workers)

        else:
            # Every remaining string metric is treated as a kernel ("gaussian"
            # included), so one shared warning covers both cases.
            if test != "hsic":
                warnings.warn(
                    f"{test} is a distance-based test, but {metric} "
                    "is a kernel. results may not be optimal. it is "
                    "recomended to use either a hisc as a test or one of "
                    f"the distances: {_VALID_DISTANCES} as a metric.",
                    UserWarning,
                )

            if metric == "gaussian":
                metric_func = gaussian
            else:

                def metric_func(X, Y=None, metric=metric, workers=None):
                    return pairwise_kernels(X, Y, metric=metric, n_jobs=workers)

        # ``self.test`` holds the constructed KSample object, not the name.
        self.test = KSample(test, compute_distance=metric_func)
        self.n_bootstraps = n_bootstraps
        self.workers = workers
        self.size_correction = size_correction
예제 #6
0
def run_dcorr(data1, data2):
    """Run a "Dcorr" ``KSample`` test on two samples.

    The test is built with ``euclidean`` as its distance function and is
    called with ``auto=True``.

    Returns
    -------
    tuple
        ``(stat, pval)`` as produced by ``KSample.test``.
    """
    tester = KSample("Dcorr", compute_distance=euclidean)
    outcome = tester.test(data1, data2, auto=True)
    statistic, p_value = outcome
    return statistic, p_value
예제 #7
0
        "x", "y", "z"
    ]].values
    return (in_axon, in_dend, out_axon, out_dend)


# One label per synapse group, in the same order as the tuple returned by
# ``split_in_out``.
names = ["Axon input", "Dendrite input", "Axon output", "Dendrite output"]
syn_groups1 = split_in_out(label1_inputs, label1_outputs)
syn_groups2 = split_in_out(label2_inputs, label2_outputs)

# One row per synapse group; cells are filled in by the loop below.
result_df = pd.DataFrame(
    index=names,
    columns=["pval", "stat", "n_sample1", "n_sample2"],
    dtype="float64",
)

# True: run the full test (statistic and p-value).
# False: compute the statistic only, leaving "pval" unset.
run_test = True
print("Running dcorr...")
ksamp = KSample("Dcorr")
for i, (n, data1, data2) in enumerate(zip(names, syn_groups1, syn_groups2)):
    print(n)
    print(data1.shape)
    print(data2.shape)
    if not run_test:
        stat = ksamp._statistic(data1, data2)
    else:
        stat, pval = ksamp.test(data1, data2, auto=True)
        result_df.loc[n, "pval"] = pval
    result_df.loc[n, "stat"] = stat
    result_df.loc[n, "n_sample1"] = len(data1)
    result_df.loc[n, "n_sample2"] = len(data2)
    print()
예제 #8
0
class LatentDistributionTest(BaseInference):
    """
    Two-sample hypothesis test for the problem of determining whether two random
    dot product graphs have the same distributions of latent positions.

    This test can operate on two graphs where there is no known matching
    between the vertices of the two graphs, or even when the number of vertices
    is different. Currently, testing is only supported for undirected graphs.

    Read more in the :ref:`tutorials <inference_tutorials>`

    Parameters
    ----------
    test : str
        Backend hypothesis test to use, one of ["cca", "dcorr", "hhg", "rv", "hsic", "mgc"].
        These tests are typically used for independence testing, but here they
        are used for a two-sample hypothesis test on the latent positions of
        two graphs. See :class:`hyppo.ksample.KSample` for more information.

    metric : str or function, (default="euclidean")
        Distance or a kernel metric to use, either a callable or a valid string.
        If a callable, then it should behave similarly to either
        :func:`sklearn.metrics.pairwise_distances` or to
        :func:`sklearn.metrics.pairwise.pairwise_kernels`.
        If a string, then it should be either one of the keys in either
        `sklearn.metrics.pairwise.PAIRED_DISTANCES` or in
        `sklearn.metrics.pairwise.PAIRWISE_KERNEL_FUNCTIONS`, or "gaussian",
        which will use a gaussian kernel with an adaptively selected bandwidth.
        It is recommended to use kernels (e.g. "gaussian") with kernel-based
        hsic test and distances (e.g. "euclidean") with all other tests.

    n_components : int or None, optional (default=None)
        Number of embedding dimensions. If None, the optimal embedding
        dimensions are found by the Zhu and Ghodsi algorithm.
        See :func:`~graspy.embed.selectSVD` for more information.

    n_bootstraps : int (default=200)
        Number of bootstrap iterations for the backend hypothesis test.
        See :class:`hyppo.ksample.KSample` for more information.

    workers : int, optional (default=1)
        Number of workers to use. If more than 1, parallelizes the code.
        Supply -1 to use all cores available to the Process.

    size_correction: bool (default=True)
        Ignored when the two graphs have the same number of vertices. The test degrades
        in validity as the number of vertices of the two graphs diverge from each other,
        unless a correction is performed.
        If True, when the two graphs have different numbers of vertices, estimates
        the plug-in estimator for the variance and uses it to correct the
        embedding of the larger graph.
        If False, does not perform any modifications (not recommended).

    Attributes
    ----------
    null_distribution_ : ndarray, shape (n_bootstraps, )
        The distribution of T statistics generated under the null.

    sample_T_statistic_ : float
        The observed difference between the embedded latent positions of the two
        input graphs.

    p_value_ : float
        The overall p value from the test.

    References
    ----------
    .. [1] Tang, M., Athreya, A., Sussman, D. L., Lyzinski, V., & Priebe, C. E. (2017).
        "A nonparametric two-sample hypothesis testing problem for random graphs."
        Bernoulli, 23(3), 1599-1630.

    .. [2] Panda, S., Palaniappan, S., Xiong, J., Bridgeford, E., Mehta, R., Shen, C., & Vogelstein, J. (2019).
        "hyppo: A Comprehensive Multivariate Hypothesis Testing Python Package."
        arXiv:1907.02088.

    .. [3] Varjavand, B., Arroyo, J., Tang, M., Priebe, C., and Vogelstein, J. (2019).
       "Improving Power of 2-Sample Random Graph Tests with Applications in Connectomics"
       arXiv:1911.02741

    .. [4] Alyakin, A., Agterberg, J., Helm, H., Priebe, C. (2020)
       "Correcting a Nonparametric Two-sample Graph Hypothesis test for Differing Orders"
       TODO cite the arXiv whenever possible
    """
    def __init__(
        self,
        test="dcorr",
        metric="euclidean",
        n_components=None,
        n_bootstraps=200,
        workers=1,
        size_correction=True,
    ):
        """
        Validate the settings and build the backend ``KSample`` test object.

        Parameters
        ----------
        test : str (default="dcorr")
            Name of the backend test; must be a member of ``_VALID_TESTS``.
        metric : str or callable (default="euclidean")
            Either a member of ``_VALID_METRICS`` or a callable, which is
            used as the distance/kernel function unchanged.
        n_components : int or None (default=None)
            Forwarded to the parent class initializer.
        n_bootstraps : int (default=200)
            Must be non-negative.
        workers : int (default=1)
        size_correction : bool (default=True)

        Raises
        ------
        TypeError
            If any argument has the wrong type.
        ValueError
            If ``test`` or ``metric`` is unknown, or ``n_bootstraps`` is
            negative.

        Warns
        -----
        UserWarning
            If a distance metric is combined with the "hsic" test, or a
            kernel metric with any other test.
        """
        # --- argument validation -------------------------------------------
        if not isinstance(test, str):
            raise TypeError("test must be a str, not {}".format(type(test)))
        if test not in _VALID_TESTS:
            raise ValueError(
                "Unknown test {}. Valid tests are {}".format(test, _VALID_TESTS))

        if not (isinstance(metric, str) or callable(metric)):
            raise TypeError(
                "Metric must be str or callable, not {}".format(type(metric)))
        if metric not in _VALID_METRICS and not callable(metric):
            raise ValueError(
                "Unknown metric {}. Valid metrics are {}, or a callable".format(
                    metric, _VALID_METRICS))

        if n_components is not None and not isinstance(n_components, int):
            raise TypeError(
                "n_components must be an int, not {}.".format(type(n_components)))

        if not isinstance(n_bootstraps, int):
            raise TypeError(
                "n_bootstraps must be an int, not {}".format(type(n_bootstraps)))
        if n_bootstraps < 0:
            raise ValueError(
                "{} is invalid number of bootstraps, must be non-negative".format(
                    n_bootstraps))

        if not isinstance(workers, int):
            raise TypeError(
                "workers must be an int, not {}".format(type(workers)))

        if not isinstance(size_correction, bool):
            raise TypeError(
                "size_correction must be a bool, not {}".format(
                    type(size_correction)))

        super().__init__(n_components=n_components)

        # --- resolve the metric into a callable ------------------------------
        if callable(metric):
            metric_func = metric
        elif metric in _VALID_DISTANCES:
            if test == "hsic":
                warnings.warn(
                    f"{test} is a kernel-based test, but {metric} "
                    "is a distance. results may not be optimal. it is "
                    "recomended to use either a different test or one of "
                    f"the kernels: {_VALID_KERNELS} as a metric.",
                    UserWarning,
                )

            def metric_func(X, Y=None, metric=metric, workers=None):
                return pairwise_distances(X, Y, metric=metric, n_jobs=workers)

        else:
            # Every remaining string metric is treated as a kernel ("gaussian"
            # included), so one shared warning covers both cases.
            if test != "hsic":
                warnings.warn(
                    f"{test} is a distance-based test, but {metric} "
                    "is a kernel. results may not be optimal. it is "
                    "recomended to use either a hisc as a test or one of "
                    f"the distances: {_VALID_DISTANCES} as a metric.",
                    UserWarning,
                )

            if metric == "gaussian":
                metric_func = gaussian
            else:

                def metric_func(X, Y=None, metric=metric, workers=None):
                    return pairwise_kernels(X, Y, metric=metric, n_jobs=workers)

        # ``self.test`` holds the constructed KSample object, not the name.
        self.test = KSample(test, compute_distance=metric_func)
        self.n_bootstraps = n_bootstraps
        self.workers = workers
        self.size_correction = size_correction

    def _embed(self, A1, A2):
        """
        Embed both graphs with an adjacency spectral embedding.

        When ``self.n_components`` is None it is first set (on the instance)
        to the larger of the two values obtained from ``select_dimension`` for
        each graph, so that both embeddings share one dimensionality.

        Parameters
        ----------
        A1, A2
            The two graphs to embed, passed unchanged to
            ``AdjacencySpectralEmbed.fit_transform``.

        Returns
        -------
        X1_hat, X2_hat
            The two embeddings. If ``fit_transform`` yields a tuple for both
            graphs, the entries of each tuple are concatenated along the last
            axis.

        Raises
        ------
        ValueError
            If exactly one of the two embeddings comes back as a tuple.
        """
        if self.n_components is None:
            candidate_dims = [
                select_dimension(graph)[0][-1] for graph in (A1, A2)
            ]
            self.n_components = max(candidate_dims)

        embedder = AdjacencySpectralEmbed(n_components=self.n_components)
        X1_hat = embedder.fit_transform(A1)
        X2_hat = embedder.fit_transform(A2)

        first_is_tuple = isinstance(X1_hat, tuple)
        second_is_tuple = isinstance(X2_hat, tuple)

        # exactly one tuple result means the graphs were embedded differently
        if first_is_tuple != second_is_tuple:
            raise ValueError("input graphs do not have same directedness. "
                             "consider symmetrizing the directed graph.")

        if first_is_tuple:
            X1_hat = np.concatenate(X1_hat, axis=-1)
            X2_hat = np.concatenate(X2_hat, axis=-1)

        return X1_hat, X2_hat

    def _sample_modified_ase(self, X, Y, pooled=False):
        N, M = len(X), len(Y)

        # return if graphs are same order, else else ensure X the larger graph.
        if N == M:
            return X, Y
        elif M > N:
            reverse_order = True
            X, Y = Y, X
            N, M = M, N
        else:
            reverse_order = False

        # estimate the central limit theorem variance
        if pooled:
            # TODO unclear whether using pooled estimator provides more power.
            # TODO this should be investigated. should not matter under null.
            two_samples = np.concatenate([X, Y], axis=0)
            get_sigma = _fit_plug_in_variance_estimator(two_samples)
        else:
            get_sigma = _fit_plug_in_variance_estimator(X)
        X_sigmas = get_sigma(X) * (N - M) / (N * M)

        # increase the variance of X by sampling from the asy dist
        X_sampled = np.zeros(X.shape)
        # TODO may be parallelized, but requires keeping track of random state
        for i in range(N):
            X_sampled[i, :] = X[i, :] + stats.multivariate_normal.rvs(
                cov=X_sigmas[i])

        # return the embeddings in the appropriate order
        return (Y, X_sampled) if reverse_order else (X_sampled, Y)

    def fit(self, A1, A2):
        """
        Run the two-sample test on a pair of graphs.

        Both inputs are passed through ``import_graph``, embedded with
        ``self._embed``, aligned with ``_median_sign_flips`` and, when
        ``self.size_correction`` is set, adjusted by
        ``self._sample_modified_ase`` before the backend test is called.

        Parameters
        ----------
        A1, A2 : nx.Graph, nx.DiGraph, nx.MultiDiGraph, nx.MultiGraph, np.ndarray
            The two graphs to run a hypothesis test on.

        Returns
        -------
        self
            With ``null_distribution_``, ``sample_T_statistic_`` and
            ``p_value_`` populated from the backend test.
        """
        graph1 = import_graph(A1)
        graph2 = import_graph(A2)

        embeddings = self._embed(graph1, graph2)
        X1_hat, X2_hat = _median_sign_flips(embeddings[0], embeddings[1])

        if self.size_correction:
            X1_hat, X2_hat = self._sample_modified_ase(X1_hat, X2_hat)

        test_kwargs = dict(reps=self.n_bootstraps,
                           workers=self.workers,
                           auto=False)
        outcome = self.test.test(X1_hat, X2_hat, **test_kwargs)

        self.null_distribution_ = self.test.indep_test.null_dist
        self.sample_T_statistic_ = outcome[0]
        self.p_value_ = outcome[1]

        return self
# Example #9
0
class LatentDistributionTest(BaseInference):
    """Two-sample hypothesis test for the problem of determining whether two random
    dot product graphs have the same distributions of latent positions.

    This test can operate on two graphs where there is no known matching
    between the vertices of the two graphs, or even when the number of vertices
    is different. Currently, testing is only supported for undirected graphs.

    Read more in the :ref:`tutorials <inference_tutorials>`

    Parameters
    ----------
    test : str (default="dcorr")
        Backend hypothesis test to use, one of ["cca", "dcorr", "hhg", "rv", "hsic", "mgc"].
        These tests are typically used for independence testing, but here they
        are used for a two-sample hypothesis test on the latent positions of
        two graphs. See :class:`hyppo.ksample.KSample` for more information.

    metric : str or function (default="euclidean")
        Distance or a kernel metric to use, either a callable or a valid string.
        If a callable, then it should behave similarly to either
        :func:`sklearn.metrics.pairwise_distances` or to
        :func:`sklearn.metrics.pairwise.pairwise_kernels`.
        If a string, then it should be either one of the keys in either
        `sklearn.metrics.pairwise.PAIRED_DISTANCES` or in
        `sklearn.metrics.pairwise.PAIRWISE_KERNEL_FUNCTIONS`, or "gaussian",
        which will use a gaussian kernel with an adaptively selected bandwidth.
        It is recommended to use kernels (e.g. "gaussian") with kernel-based
        hsic test and distances (e.g. "euclidean") with all other tests.
        A UserWarning is issued when a distance is paired with "hsic" or a
        kernel is paired with any other test.

    n_components : int or None (default=None)
        Number of embedding dimensions. If None, the optimal embedding
        dimensions are found by the Zhu and Ghodsi algorithm.
        See :func:`~graspy.embed.selectSVD` for more information.
        This argument is ignored if input_graph=False.

    n_bootstraps : int (default=200)
        Number of bootstrap iterations for the backend hypothesis test.
        See :class:`hyppo.ksample.KSample` for more information.

    workers : int (default=1)
        Number of workers to use. If more than 1, parallelizes the code.
        Supply -1 to use all cores available to the Process.

    size_correction: bool (default=True)
        Ignored when the two graphs have the same number of vertices. The test
        degrades in validity as the number of vertices of the two graphs
        diverge from each other, unless a correction is performed.

        - True
            Whenever the two graphs have different numbers of vertices,
            estimates the plug-in estimator for the variance and uses it to
            correct the embedding of the larger graph.
        - False
            Does not perform any modifications (not recommended).

    pooled: bool (default=False)
        Ignored whenever the two graphs have the same number of vertices or
        size_correction is set to False. In order to correct the adjacency
        spectral embedding used in the test, it is needed to estimate the
        variance for each of the latent position estimates in the larger graph,
        which requires to compute different sample moments. These moments can
        be computed either over the larger graph (False), or over both graphs
        (True). Setting it to True should not affect the behavior of the test
        under the null hypothesis, but it is not clear whether it has more
        power or less power under which alternatives. Generally not recommended,
        as it is untested and included for experimental purposes.

    input_graph : bool (default=True)
        Flag whether to expect two full graphs, or the embeddings.

        - True
            .fit() and .fit_predict() expect graphs, either as NetworkX graph objects
            or as adjacency matrices, provided as ndarrays of size (n, n) and
            (m, m). They will be embedded using adjacency spectral embeddings.
        - False
            .fit() and .fit_predict() expect adjacency spectral embeddings of
            the graphs, they must be ndarrays of size (n, d) and (m, d), where
            d must be same. n_components attribute is ignored in this case.

    Attributes
    ----------
    null_distribution_ : ndarray, shape (n_bootstraps, )
        The distribution of T statistics generated under the null.

    sample_T_statistic_ : float
        The observed difference between the embedded latent positions of the
        two input graphs.

    p_value_ : float
        The overall p value from the test.

    References
    ----------
    .. [1] Tang, M., Athreya, A., Sussman, D. L., Lyzinski, V., & Priebe, C. E. (2017).
        "A nonparametric two-sample hypothesis testing problem for random graphs."
        Bernoulli, 23(3), 1599-1630.

    .. [2] Panda, S., Palaniappan, S., Xiong, J., Bridgeford, E., Mehta, R., Shen, C., & Vogelstein, J. (2019).
        "hyppo: A Comprehensive Multivariate Hypothesis Testing Python Package."
        arXiv:1907.02088.

    .. [3] Alyakin, A., Agterberg, J., Helm, H., Priebe, C. (2020).
       "Correcting a Nonparametric Two-sample Graph Hypothesis Test for Graphs with Different Numbers of Vertices"
       arXiv:2008.09434

    """

    def __init__(
        self,
        test="dcorr",
        metric="euclidean",
        n_components=None,
        n_bootstraps=200,
        workers=1,
        size_correction=True,
        pooled=False,
        input_graph=True,
    ):

        if not isinstance(test, str):
            msg = "test must be a str, not {}".format(type(test))
            raise TypeError(msg)
        elif test not in _VALID_TESTS:
            msg = "Unknown test {}. Valid tests are {}".format(test, _VALID_TESTS)
            raise ValueError(msg)

        if not isinstance(metric, str) and not callable(metric):
            msg = "Metric must be str or callable, not {}".format(type(metric))
            raise TypeError(msg)
        elif metric not in _VALID_METRICS and not callable(metric):
            msg = "Unknown metric {}. Valid metrics are {}, or a callable".format(
                metric, _VALID_METRICS
            )
            raise ValueError(msg)

        if n_components is not None:
            if not isinstance(n_components, int):
                msg = "n_components must be an int, not {}.".format(type(n_components))
                raise TypeError(msg)

        if not isinstance(n_bootstraps, int):
            msg = "n_bootstraps must be an int, not {}".format(type(n_bootstraps))
            raise TypeError(msg)
        elif n_bootstraps < 0:
            msg = "{} is invalid number of bootstraps, must be non-negative"
            raise ValueError(msg.format(n_bootstraps))

        if not isinstance(workers, int):
            msg = "workers must be an int, not {}".format(type(workers))
            raise TypeError(msg)

        if not isinstance(size_correction, bool):
            msg = "size_correction must be a bool, not {}".format(type(size_correction))
            raise TypeError(msg)

        if not isinstance(pooled, bool):
            msg = "pooled must be a bool, not {}".format(type(pooled))
            raise TypeError(msg)

        if not isinstance(input_graph, bool):
            msg = "input_graph must be a bool, not {}".format(type(input_graph))
            raise TypeError(msg)

        super().__init__(n_components=n_components)

        # Resolve `metric` into the callable handed to the backend test.
        if callable(metric):
            metric_func = metric
        elif metric in _VALID_DISTANCES:
            if test == "hsic":
                msg = (
                    f"{test} is a kernel-based test, but {metric} "
                    "is a distance. results may not be optimal. it is "
                    "recommended to use either a different test or one of "
                    f"the kernels: {_VALID_KERNELS} as a metric."
                )
                warnings.warn(msg, UserWarning)

            def metric_func(X, Y=None, metric=metric, workers=None):
                return pairwise_distances(X, Y, metric=metric, n_jobs=workers)

        else:
            # Every remaining valid string ("gaussian" included) is handled as
            # a kernel, so a single warning covers both kernel branches.
            if test != "hsic":
                msg = (
                    f"{test} is a distance-based test, but {metric} "
                    "is a kernel. results may not be optimal. it is "
                    "recommended to use either hsic as a test or one of "
                    f"the distances: {_VALID_DISTANCES} as a metric."
                )
                warnings.warn(msg, UserWarning)

            if metric == "gaussian":
                metric_func = gaussian
            else:

                def metric_func(X, Y=None, metric=metric, workers=None):
                    return pairwise_kernels(X, Y, metric=metric, n_jobs=workers)

        self.test = KSample(test, compute_distance=metric_func)
        self.n_bootstraps = n_bootstraps
        self.workers = workers
        self.size_correction = size_correction
        self.pooled = pooled
        self.input_graph = input_graph

    def _embed(self, A1, A2):
        """
        Embed both graphs with an adjacency spectral embedding.

        If ``self.n_components`` is None, it is set on the instance to the
        larger of the two values obtained from ``select_dimension``. When both
        embeddings come back as tuples their entries are concatenated along the
        last axis; if only one does, a ValueError is raised.
        """
        if self.n_components is None:
            num_dims1 = select_dimension(A1)[0][-1]
            num_dims2 = select_dimension(A2)[0][-1]
            self.n_components = max(num_dims1, num_dims2)

        ase = AdjacencySpectralEmbed(n_components=self.n_components)
        X1_hat = ase.fit_transform(A1)
        X2_hat = ase.fit_transform(A2)

        if isinstance(X1_hat, tuple) and isinstance(X2_hat, tuple):
            X1_hat = np.concatenate(X1_hat, axis=-1)
            X2_hat = np.concatenate(X2_hat, axis=-1)
        elif isinstance(X1_hat, tuple) ^ isinstance(X2_hat, tuple):
            msg = (
                "input graphs do not have same directedness. "
                "consider symmetrizing the directed graph."
            )
            raise ValueError(msg)

        return X1_hat, X2_hat

    def _sample_modified_ase(self, X, Y, pooled=False):
        """
        Add sampled noise to the embedding that has more rows.

        The embeddings are returned unchanged when they have equal length.
        Otherwise a plug-in variance estimator is fitted (on the larger
        embedding, or on both stacked together when ``pooled`` is True), its
        per-row output is scaled by ``(N - M) / (N * M)``, and one draw from
        ``stats.multivariate_normal`` with that covariance is added to each
        row of the larger embedding. The original argument order is preserved
        in the return value.
        """
        N, M = len(X), len(Y)

        # return if graphs are same order, else ensure X is the larger graph.
        if N == M:
            return X, Y
        elif M > N:
            reverse_order = True
            X, Y = Y, X
            N, M = M, N
        else:
            reverse_order = False

        # estimate the central limit theorem variance
        if pooled:
            two_samples = np.concatenate([X, Y], axis=0)
            get_sigma = _fit_plug_in_variance_estimator(two_samples)
        else:
            get_sigma = _fit_plug_in_variance_estimator(X)
        X_sigmas = get_sigma(X) * (N - M) / (N * M)

        # increase the variance of X by sampling from the asymptotic dist
        X_sampled = np.zeros(X.shape)
        # TODO may be parallelized, but requires keeping track of random state
        for i in range(N):
            X_sampled[i, :] = X[i, :] + stats.multivariate_normal.rvs(cov=X_sigmas[i])

        # return the embeddings in the appropriate order
        return (Y, X_sampled) if reverse_order else (X_sampled, Y)

    def fit(self, A1, A2):
        """
        Fits the test to the two input graphs

        Parameters
        ----------
        A1, A2 : variable (see description)
            The two graphs, or their embeddings to run a hypothesis test on.
            Expected variable type and shape depends on input_graph attribute:

            - input_graph=True
                expects two unembedded graphs either as NetworkX graph objects, or as
                two np.ndarrays, representing the adjacency matrices. In this
                case will be embedded using adjacency spectral embedding.
            - input_graph=False
                expects two already embedded graphs. In this case they must be
                arrays of shape (n, d) and (m, d), where d, the number of
                components, must be shared.

            Note that regardless of how the graphs are passed, they need not
            have the same number of vertices.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If input_graph is False and either input is not an np.ndarray.
        ValueError
            If input_graph is False and either input is not two-dimensional,
            or the two inputs differ in their second dimension.
        """
        if self.input_graph:
            A1 = import_graph(A1)
            A2 = import_graph(A2)

            X1_hat, X2_hat = self._embed(A1, A2)
        else:
            # check for nx objects, since they are castable to arrays,
            # but we don't want that
            if not isinstance(A1, np.ndarray):
                msg = (
                    f"Embedding of the first graph is of type {type(A1)}, not "
                    "np.ndarray. If input_graph is False, the inputs need to be "
                    "adjacency spectral embeddings, with shapes (n, d) and "
                    "(m, d), passed as np.ndarrays."
                )
                raise TypeError(msg)
            if not isinstance(A2, np.ndarray):
                msg = (
                    f"Embedding of the second graph is of type {type(A2)}, not an "
                    "array. If input_graph is False, the inputs need to be "
                    "adjacency spectral embeddings, with shapes (n, d) and "
                    "(m, d), passed as np.ndarrays."
                )
                raise TypeError(msg)

            if A1.ndim != 2:
                msg = (
                    "Embedding array of the first graph does not have two dimensions. "
                    "If input_graph is False, the inputs need to be adjacency "
                    "spectral embeddings, with shapes (n, d) and (m, d)"
                )
                raise ValueError(msg)
            if A2.ndim != 2:
                msg = (
                    "Embedding array of the second graph does not have two dimensions. "
                    "If input_graph is False, the inputs need to be adjacency "
                    "spectral embeddings, with shapes (n, d) and (m, d)"
                )
                raise ValueError(msg)
            if A1.shape[1] != A2.shape[1]:
                msg = (
                    "Two input embeddings have different number of components. "
                    "If input_graph is False, the inputs need to be adjacency "
                    "spectral embeddings, with shapes (n, d) and (m, d)"
                )
                raise ValueError(msg)

            # checking for inf values
            X1_hat = check_array(A1)
            X2_hat = check_array(A2)

        X1_hat, X2_hat = _median_sign_flips(X1_hat, X2_hat)

        if self.size_correction:
            X1_hat, X2_hat = self._sample_modified_ase(
                X1_hat, X2_hat, pooled=self.pooled
            )

        data = self.test.test(
            X1_hat, X2_hat, reps=self.n_bootstraps, workers=self.workers, auto=False
        )

        self.null_distribution_ = self.test.indep_test.null_dist
        self.sample_T_statistic_ = data[0]
        self.p_value_ = data[1]

        return self

    def fit_predict(self, A1, A2):
        """
        Fits the test to the two input graphs and returns the p-value

        Parameters
        ----------
        A1, A2 : variable (see description)
            The two graphs, or their embeddings to run a hypothesis test on.
            Expected variable type and shape depends on input_graph attribute:

            - input_graph=True
                expects two unembedded graphs either as NetworkX graph objects, or as
                two np.ndarrays, representing the adjacency matrices. In this
                case will be embedded using adjacency spectral embedding.
            - input_graph=False
                expects two already embedded graphs. In this case they must be
                arrays of shape (n, d) and (m, d), where d, the number of
                components, must be shared.

            Note that regardless of how the graphs are passed, they need not
            have the same number of vertices.

        Returns
        -------
        p_value_ : float
            The overall p value from the test
        """
        # abstract method overwritten in order to have a custom doc string
        self.fit(A1, A2)
        return self.p_value_
# Example #10
0
    in_dend = inputs[inputs["postsynaptic_type"] == "dend"][["x", "y",
                                                             "z"]].values
    out_axon = outputs[outputs["presynaptic_type"] == "axon"][["x", "y",
                                                               "z"]].values
    out_dend = outputs[outputs["presynaptic_type"] == "dend"][["x", "y",
                                                               "z"]].values
    return (in_axon, in_dend, out_axon, out_dend)


# Labels for the four synapse groups, in the order split_in_out returns them.
names = ["Axon input", "Dendrite input", "Axon output", "Dendrite output"]
syn_groups1 = split_in_out(label1_inputs, label1_outputs)
syn_groups2 = split_in_out(label2_inputs, label2_outputs)

# An explicit dtype keeps the result numeric regardless of the pandas version:
# without it the default dtype of a data-less Series differs between releases.
stat_series = pd.Series(index=names, dtype=float)

ksamp = KSample("Dcorr")
for i, n in enumerate(names):
    data1 = syn_groups1[i]
    data2 = syn_groups2[i]
    # NOTE(review): _statistic is a private KSample method; it is used here to
    # get only the test statistic, without running the permutation test.
    stat = ksamp._statistic(data1, data2)
    stat_series[n] = stat

print(stat_series)
# plot_vars = np.array(["x", "y", "z"])

# def plot_connectors(data, Z, x, y, ax, mins, maxs):
#     sns.scatterplot(
#         data=data,
#         y=plot_vars[y],
#         x=plot_vars[x],
#         s=3,
def _read_hdf_with_fallback(path, primary_key, fallback_key):
    """Read ``primary_key`` from the HDF file at ``path``; on KeyError read ``fallback_key``."""
    try:
        return pd.read_hdf(path, key=primary_key)
    except KeyError:
        return pd.read_hdf(path, key=fallback_key)


def main_mv_test_single_site(data_dir, out_dir, out_filename, n_threads, reps,
                             sites):
    """
    Run k-sample tests of training data against each test window, per site.

    For every combination of test, site, target, representation, matching data
    file and model type, the training matrix is compared with the test matrix
    of each (year, month) window using ``KSample(indep_test).test``. The
    p-values and the shapes of the matrices are collected in a DataFrame that
    is rewritten to ``<out_filename>.csv`` and ``<out_filename>.pkl`` in
    ``out_dir`` after every test, so partial results survive an interruption.

    Relies on the module-level names ``targets``, ``representations``,
    ``models``, ``independent_tests``, ``year_range``, ``month_intervals`` and
    ``idx`` (presumably ``pd.IndexSlice`` -- confirm against the file header).

    Parameters
    ----------
    data_dir : str
        Directory holding ``results_df_single_site.pkl`` and the HDF data
        files whose names contain ``single-site-style``.
    out_dir : str
        Directory the result files are written to.
    out_filename : str
        Base name (without extension) of the result files.
    n_threads : int
        Passed to the test as ``workers``.
    reps : int
        Passed to the test as ``reps``.
    sites : iterable of str
        Sites to process; each must appear as ``_<site>_`` in a file name for
        that file to be used.

    Returns
    -------
    None
    """
    df_results = pd.read_pickle(
        os.path.join(data_dir, "results_df_single_site.pkl"))
    data_files = [
        f for f in os.listdir(data_dir) if 'single-site-style' in f
    ]

    main_measures = ['AUC', 'APR', 'ECE']
    training_years = [2008, 2009, 2010]

    # Per-(target, representation, model) columns: four shape bookkeeping
    # columns followed by one p-value column per test.
    shape_columns = ['base_rows', 'base_cols', 'data_rows', 'data_cols']
    columns = [(target, representation, modeltype, indep_test)
               for target in targets
               for representation in representations
               for modeltype in models
               for indep_test in shape_columns + independent_tests]

    ind = [(site, yr, mnth) for site in sites for yr in year_range
           for mnth in month_intervals]
    ind = pd.MultiIndex.from_tuples(ind, names=('hospital', 'year', 'month'))
    cols = pd.MultiIndex.from_tuples(columns,
                                     names=('target', 'representation',
                                            'model', 'indep_test'))
    indep_test_df = pd.DataFrame(index=ind, columns=cols)

    # Loop-invariant pieces, built once.
    train_key = "X_train_" + "-".join(str(i) for i in training_years)
    csv_path = os.path.join(out_dir, out_filename + ".csv")
    pkl_path = os.path.join(out_dir, out_filename + ".pkl")

    for indep_test in independent_tests:
        for site in sites:
            for target in targets:
                for rep in representations:
                    for f in data_files:
                        matches = ((target in f)
                                   and ("_" + rep + "_" in f)
                                   and ("_" + site + "_" in f))
                        if not matches:
                            continue

                        print(target, rep)
                        file_path = os.path.join(data_dir, f)

                        for modeltype in models:
                            print(modeltype)
                            model_suffix = "_" + modeltype.upper()

                            # Prefer the model-specific key; fall back to the
                            # generic one when the file does not have it.
                            X1 = _read_hdf_with_fallback(
                                file_path, train_key + model_suffix,
                                train_key)
                            print('X1_shape:', X1.shape)

                            indep_test_df.loc[
                                idx[site, :, :],
                                idx[target, rep, modeltype,
                                    'base_rows']] = X1.shape[0]
                            indep_test_df.loc[
                                idx[site, :, :],
                                idx[target, rep, modeltype,
                                    'base_cols']] = X1.shape[1]

                            for year in year_range:
                                for month in month_intervals:
                                    measures = df_results.loc[
                                        (site, year, month),
                                        idx[target, modeltype, rep,
                                            main_measures]]
                                    # Skip windows with no recorded measure.
                                    # Logical test instead of `~`: bitwise
                                    # inversion of a plain bool is always
                                    # truthy.
                                    if measures.isna().all():
                                        continue

                                    print(indep_test, site, target, rep,
                                          modeltype, year, month)
                                    test_key = (
                                        "X_test_" + str(year) + "_"
                                        + "-".join(
                                            str(i)
                                            for i in [month - 1, month]))
                                    X2 = _read_hdf_with_fallback(
                                        file_path, test_key + model_suffix,
                                        test_key)
                                    print('X2_shape:', X2.shape)

                                    indep_test_df.loc[
                                        (site, year, month),
                                        idx[target, rep, modeltype,
                                            'data_rows']] = X2.shape[0]
                                    indep_test_df.loc[
                                        (site, year, month),
                                        idx[target, rep, modeltype,
                                            'data_cols']] = X2.shape[1]

                                    t0 = time.time()
                                    # Reseed NumPy's global generator before
                                    # every test call.
                                    np.random.seed(0)
                                    stat, pvalue = KSample(indep_test).test(
                                        X1.values,
                                        X2.values,
                                        workers=n_threads,
                                        reps=reps,
                                        auto=True)
                                    t1 = time.time()
                                    print("runtime=", str((t1 - t0)),
                                          "seconds")
                                    print("stat, pval= {:0.3f}, {:0.3f}".
                                          format(stat, pvalue))

                                    indep_test_df.loc[
                                        (site, year, month),
                                        idx[target, rep, modeltype,
                                            indep_test]] = pvalue
                                    # Checkpoint after every test.
                                    indep_test_df.to_csv(csv_path)
                                    indep_test_df.to_pickle(pkl_path)
                                    print('*' * 30)
def latent_distribution_test(
    A1: GraphRepresentation,
    A2: GraphRepresentation,
    test: LdtTestType = "dcorr",
    metric: Union[str, Callable] = "euclidean",
    n_components: Optional[int] = None,
    n_bootstraps: int = 500,
    random_state: Optional[Union[int, np.random.RandomState,
                                 np.random.Generator]] = None,
    workers: Optional[int] = None,
    size_correction: bool = True,
    pooled: bool = False,
    align_type: Optional[Literal["sign_flips",
                                 "seedless_procrustes"]] = "sign_flips",
    align_kws: Dict[str, Any] = {},
    input_graph: bool = True,
) -> ldt_result:
    """Two-sample hypothesis test for the problem of determining whether two random
    dot product graphs have the same distributions of latent positions.

    This test can operate on two graphs where there is no known matching
    between the vertices of the two graphs, or even when the number of vertices
    is different. Currently, testing is only supported for undirected graphs.

    Read more in the `Latent Distribution Two-Graph Testing Tutorial
    <https://microsoft.github.io/graspologic/tutorials/inference/latent_distribution_test.html>`_

    Parameters
    ----------
    A1, A2 : variable (see description of 'input_graph')
        The two graphs, or their embeddings to run a hypothesis test on.
        Expected variable type and shape depends on the ``input_graph``
        argument.

    test : str (default="dcorr")
        Backend hypothesis test to use, one of ["cca", "dcorr", "hhg", "rv", "hsic", "mgc"].
        These tests are typically used for independence testing, but here they
        are used for a two-sample hypothesis test on the latent positions of
        two graphs. See :class:`hyppo.ksample.KSample` for more information.

    metric : str or function (default="euclidean")
        Distance or a kernel metric to use, either a callable or a valid string.
        Kernel metrics (e.g. "gaussian") must be used with kernel-based HSIC test
        and distances (e.g. "euclidean") with all other tests. If a callable,
        then it should behave similarly to either
        :func:`sklearn.metrics.pairwise_distances` or to
        :func:`sklearn.metrics.pairwise.pairwise_kernels`.

        Valid strings for distance ``metric`` are, as defined in
        :func:`sklearn.metrics.pairwise_distances`,

            - From scikit-learn: [``"euclidean"``, ``"cityblock"``, ``"cosine"``,
              ``"l1"``, ``"l2"``, ``"manhattan"``].
            - From scipy.spatial.distance: [``"braycurtis"``, ``"canberra"``,
              ``"chebyshev"``, ``"correlation"``, ``"dice"``, ``"hamming"``,
              ``"jaccard"``, ``"kulsinski"``, ``"mahalanobis"``, ``"minkowski"``,
              ``"rogerstanimoto"``, ``"russellrao"``, ``"seuclidean"``,
              ``"sokalmichener"``, ``"sokalsneath"``, ``"sqeuclidean"``,
              ``"yule"``] See the documentation for :mod:`scipy.spatial.distance` for
              details on these metrics.

        Valid strings for kernel ``metric`` are, as defined in
        :func:`sklearn.metrics.pairwise.pairwise_kernels`,

            [``"additive_chi2"``, ``"chi2"``, ``"linear"``, ``"poly"``,
            ``"polynomial"``, ``"rbf"``,
            ``"laplacian"``, ``"sigmoid"``, ``"cosine"``]

        Note ``"rbf"`` and ``"gaussian"`` are the same metric, which will use
        an adaptively selected bandwidth.

    n_components : int or None (default=None)
        Number of embedding dimensions. If None, the optimal embedding
        dimensions are found by the Zhu and Ghodsi algorithm.
        See :func:`~graspologic.embed.select_svd` for more information.
        This argument is ignored if ``input_graph`` is False.

    n_bootstraps : int (default=500)
        Number of bootstrap iterations for the backend hypothesis test.
        Must be non-negative.
        See :class:`hyppo.ksample.KSample` for more information.

    random_state : {None, int, `~np.random.RandomState`, `~np.random.Generator`}
        This parameter defines the object to use for drawing random
        variates.
        If `random_state` is ``None`` the `~np.random.RandomState` singleton is
        used.
        If `random_state` is an int, a new ``RandomState`` instance is used,
        seeded with `random_state`.
        If `random_state` is already a ``RandomState`` or ``Generator``
        instance, then that object is used.
        Default is None.

    workers : int or None (default=None)
        Number of workers to use. If more than 1, parallelizes the code.
        Supply -1 to use all cores available. None is a marker for
        'unset' that will be interpreted as ``workers=1`` (sequential execution) unless
        the call is performed under a Joblib parallel_backend context manager that sets
        another value for ``workers``. See :class:`joblib.Parallel` for more details.

    size_correction : bool (default=True)
        Ignored when the two graphs have the same number of vertices. The test
        degrades in validity as the number of vertices of the two graphs
        diverge from each other, unless a correction is performed.

        - True
            Whenever the two graphs have different numbers of vertices,
            estimates the plug-in estimator for the variance and uses it to
            correct the embedding of the larger graph.
        - False
            Does not perform any modifications (not recommended).

    pooled : bool (default=False)
        Ignored whenever the two graphs have the same number of vertices or
        ``size_correction`` is set to False. In order to correct the adjacency
        spectral embedding used in the test, it is needed to estimate the
        variance for each of the latent position estimates in the larger graph,
        which requires to compute different sample moments. These moments can
        be computed either over the larger graph (False), or over both graphs
        (True). Setting it to True should not affect the behavior of the test
        under the null hypothesis, but it is not clear whether it has more
        power or less power under which alternatives. Generally not recommended,
        as it is untested and included for experimental purposes.

    align_type : str, {'sign_flips' (default), 'seedless_procrustes'} or None
        Random dot product graphs have an inherent non-identifiability,
        associated with their latent positions. Thus, two embeddings of
        different graphs may not be orthogonally aligned. Without this accounted
        for, two embeddings of different graphs may appear different, even
        if the distributions of the true latent positions are the same.
        There are several options in terms of how this can be addressed:

        - 'sign_flips'
            A simple heuristic that flips the signs of one of the embeddings,
            if the medians of the two embeddings in that dimension differ from
            each other. See :class:`graspologic.align.SignFlips` for more
            information on this procedure. In the limit, this is guaranteed to
            lead to a valid test, as long as matrix :math:`X^T X`, where
            :math:`X` is the latent positions does not have repeated non-zero
            eigenvalues. This may, however, result in an invalid test in the
            finite sample case if some eigenvalues are the same or close.
        - 'seedless_procrustes'
            An algorithm that learns an orthogonal alignment matrix. This
            procedure is slower than sign flips, but is guaranteed to yield a
            valid test in the limit, and also makes the test more valid in some
            finite sample cases, in which the eigenvalues are very close to
            each other. See :class:`graspologic.align.SeedlessProcrustes` for
            more information on the procedure.
        - None
            Do not use any alignment technique. This is strongly not
            recommended, as it may often result in a test that is not valid.

    align_kws : dict
        Keyword arguments for the aligner of choice, either
        :class:`graspologic.align.SignFlips` or
        :class:`graspologic.align.SeedlessProcrustes`, depending on the ``align_type``.
        See respective classes for more information. The dictionary is only
        unpacked into the aligner constructor and is never modified here.

    input_graph : bool (default=True)
        Flag whether to expect two full graphs, or the embeddings.

        - True
            This function expects graphs, either as NetworkX graph objects
            or as adjacency matrices, provided as ndarrays of size (n, n) and
            (m, m). They will be embedded using adjacency spectral embeddings.
        - False
            This function expects adjacency spectral embeddings of the graphs,
            they must be ndarrays of size (n, d) and (m, d), where
            d must be same. The ``n_components`` argument is ignored in this case.

    Returns
    -------
    stat : float
        The observed difference between the embedded latent positions of the
        two input graphs.

    pvalue : float
        The overall p value from the test.

    misc_dict : dictionary
        A collection of other statistics obtained from the latent position test

        - null_distribution : ndarray, shape (n_bootstraps,)
            The distribution of T statistics generated under the null.

        - n_components : int
            Number of embedding dimensions. When ``n_components`` was passed
            as None, this is the number of columns of the embedding that was
            actually tested.

        - Q : array, size (d, d)
            Final orthogonal matrix, used to modify the embedding of the first
            graph. It is the identity matrix when ``align_type`` is None.

    Raises
    ------
    TypeError
        If any argument has an unsupported type, or if ``input_graph`` is
        False and either input is not an ``np.ndarray``.
    ValueError
        If ``test``, ``metric`` or ``align_type`` is not a supported value, if
        a kernel metric is combined with a distance-based test (or the other
        way around), if ``n_bootstraps`` is negative, or if ``input_graph`` is
        False and the embeddings are not two-dimensional or disagree on the
        number of components.

    References
    ----------
    .. [1] Tang, M., Athreya, A., Sussman, D. L., Lyzinski, V., & Priebe, C. E. (2017).
        "A nonparametric two-sample hypothesis testing problem for random graphs."
        Bernoulli, 23(3), 1599-1630.

    .. [2] Panda, S., Palaniappan, S., Xiong, J., Bridgeford, E., Mehta, R., Shen, C., & Vogelstein, J. (2019).
        "hyppo: A Comprehensive Multivariate Hypothesis Testing Python Package."
        arXiv:1907.02088.

    .. [3] Alyakin, A. A., Agterberg, J., Helm, H. S., Priebe, C. E. (2020).
       "Correcting a Nonparametric Two-sample Graph Hypothesis Test for Graphs with Different Numbers of Vertices"
       arXiv:2008.09434

    """

    # check test argument
    if not isinstance(test, str):
        raise TypeError(f"test must be a str, not {type(test)}")
    if test not in _VALID_TESTS:
        raise ValueError(
            f"Unknown test {test}. Valid tests are {_VALID_TESTS}")

    # check metric argument
    if not isinstance(metric, str) and not callable(metric):
        raise TypeError(f"Metric must be str or callable, not {type(metric)}")
    if metric not in _VALID_METRICS and not callable(metric):
        raise ValueError(
            f"Unknown metric {metric}. Valid metrics are {_VALID_METRICS}, "
            "or a callable")

    # a named metric must match the family (kernel vs. distance) of the test;
    # callables cannot be classified, so they are let through unchecked
    if metric in _VALID_DISTANCES:
        if test == "hsic":
            msg = (f"{test} is a kernel-based test, but {metric} "
                   "is a distance. Use a different test or one of "
                   f"the kernels: {_VALID_KERNELS} as a metric.")
            raise ValueError(msg)
    elif metric in _VALID_KERNELS:
        if test != "hsic":
            msg = (f"{test} is a distance-based test, but {metric} "
                   "is a kernel. Use either a HSIC as the test or one of "
                   f"the distances: {_VALID_DISTANCES} as a metric.")
            raise ValueError(msg)

    # check n_components argument
    if n_components is not None and not isinstance(n_components, int):
        raise TypeError(
            f"n_components must be an int, not {type(n_components)}.")

    # check n_bootstraps argument
    if not isinstance(n_bootstraps, int):
        raise TypeError(
            f"n_bootstraps must be an int, not {type(n_bootstraps)}")
    if n_bootstraps < 0:
        raise ValueError(
            f"{n_bootstraps} is invalid number of bootstraps, "
            "must be non-negative")

    # check workers argument
    if workers is not None and not isinstance(workers, (int, np.integer)):
        raise TypeError(
            f"workers must be an int or None, not {type(workers)}")

    # check size_correction argument
    if not isinstance(size_correction, bool):
        raise TypeError(
            f"size_correction must be a bool, not {type(size_correction)}")

    # check pooled argument
    if not isinstance(pooled, bool):
        raise TypeError(f"pooled must be a bool, not {type(pooled)}")

    # check align_type argument
    if (not isinstance(align_type, str)) and (align_type is not None):
        raise TypeError(
            f"align_type must be a string or None, not {type(align_type)}")
    align_types_supported = ["sign_flips", "seedless_procrustes", None]
    if align_type not in align_types_supported:
        raise ValueError(
            f"supported align types are {align_types_supported}")

    # check align_kws argument
    if not isinstance(align_kws, dict):
        raise TypeError(
            "align_kws must be a dictionary of keyword arguments, "
            f"not {type(align_kws)}")

    # check input_graph argument
    if not isinstance(input_graph, bool):
        raise TypeError(
            f"input_graph must be a bool, not {type(input_graph)}")

    if input_graph:
        A1 = import_graph(A1)
        A2 = import_graph(A2)

        X1_hat, X2_hat = _embed(A1, A2, n_components)
    else:
        # check for nx objects, since they are castable to arrays,
        # but we don't want that
        if not isinstance(A1, np.ndarray):
            msg = (
                f"Embedding of the first graph is of type {type(A1)}, not "
                "np.ndarray. If input_graph is False, the inputs need to be "
                "adjacency spectral embeddings, with shapes (n, d) and "
                "(m, d), passed as np.ndarrays.")
            raise TypeError(msg)
        if not isinstance(A2, np.ndarray):
            msg = (
                f"Embedding of the second graph is of type {type(A2)}, not an "
                "array. If input_graph is False, the inputs need to be "
                "adjacency spectral embeddings, with shapes (n, d) and "
                "(m, d), passed as np.ndarrays.")
            raise TypeError(msg)

        if A1.ndim != 2:
            msg = (
                "Embedding array of the first graph does not have two dimensions. "
                "If input_graph is False, the inputs need to be adjacency "
                "spectral embeddings, with shapes (n, d) and (m, d)")
            raise ValueError(msg)
        if A2.ndim != 2:
            msg = (
                "Embedding array of the second graph does not have two dimensions. "
                "If input_graph is False, the inputs need to be adjacency "
                "spectral embeddings, with shapes (n, d) and (m, d)")
            raise ValueError(msg)
        if A1.shape[1] != A2.shape[1]:
            msg = ("Two input embeddings have different number of components. "
                   "If input_graph is False, the inputs need to be adjacency "
                   "spectral embeddings, with shapes (n, d) and (m, d)")
            raise ValueError(msg)

        # checking for inf values
        X1_hat = check_array(A1)
        X2_hat = check_array(A2)

    # Report the dimension that was actually tested when the caller did not
    # pin it (auto-selected, or pre-computed embeddings were supplied), so
    # that misc_dict["n_components"] is an int as documented.
    n_components_used = (n_components
                         if n_components is not None else X1_hat.shape[1])

    if align_type == "sign_flips":
        aligner = SignFlips(**align_kws)
        X1_hat = aligner.fit_transform(X1_hat, X2_hat)
        Q = aligner.Q_
    elif align_type == "seedless_procrustes":
        aligner = SeedlessProcrustes(**align_kws)
        X1_hat = aligner.fit_transform(X1_hat, X2_hat)
        Q = aligner.Q_
    else:
        # No alignment: Q acts on embedding columns, so the identity must be
        # (d, d) with d = number of components, not (n, n) over the vertices.
        Q = np.identity(X1_hat.shape[1])

    if size_correction:
        X1_hat, X2_hat = _sample_modified_ase(X1_hat,
                                              X2_hat,
                                              workers=workers,
                                              random_state=random_state,
                                              pooled=pooled)

    test_obj = KSample(test, compute_distkern=metric)

    data = test_obj.test(X1_hat,
                         X2_hat,
                         reps=n_bootstraps,
                         workers=workers,
                         auto=False)

    null_distribution = test_obj.indep_test.null_dist

    misc_dict = {
        "null_distribution": null_distribution,
        "n_components": n_components_used,
        "Q": Q,
    }
    stat = data[0]
    pvalue = data[1]

    return ldt_result(stat, pvalue, misc_dict)
# Example #13
# 0
def run_dcorr(data1, data2):
    """Run a two-sample ``KSample("Dcorr")`` test on ``data1`` and ``data2``.

    The test is invoked with ``auto=True`` and ``workers=-1``.
    NOTE(review): presumably ``auto=True`` lets the backend pick a fast
    approximation and ``workers=-1`` uses every available core -- confirm
    against the KSample documentation.

    Parameters
    ----------
    data1, data2 : array-like
        The two samples handed to ``KSample.test`` unchanged.

    Returns
    -------
    tuple
        ``(stat, pval)`` as produced by ``KSample.test``.
    """
    outcome = KSample("Dcorr").test(data1, data2, auto=True, workers=-1)
    # Unpack explicitly so a result that is not a pair still fails here.
    statistic, p_value = outcome
    return statistic, p_value