Exemplo n.º 1
0
def _validate_inputs(x: np.ndarray, y: np.ndarray, ties="average"):
    """
    Check two data matrices and convert both to pseudo-observations.

    Parameters
    ----------
    x, y : np.ndarray
        Two-dimensional input data

    ties : str, optional
        Tie-handling method forwarded to ``pseudo_obs``

    Returns
    -------
    tuple
        The converted ``x``, the converted ``y`` and the length of the second axis of the converted ``x``

    Raises
    ------
    AssertionError
        If either input is not two-dimensional, or if the converted inputs differ in the length of
        their second axis
    """
    both_matrices = all(np.ndim(arr) == 2 for arr in (x, y))
    assert both_matrices, "input data must be matrices"

    u_x, u_y = pseudo_obs(x, ties), pseudo_obs(y, ties)

    n_cols = u_x.shape[1]
    assert n_cols == u_y.shape[1], "input data must have the same dimensions"
    return u_x, u_y, n_cols
Exemplo n.º 2
0
def k_means(data: np.ndarray, n_clusters: int, n_dim: int, ties='average'):
    """
    Determines the GMC's parameters via K-means

    The clustering itself is run on the pseudo observations of `data`; the component means and
    covariances are then computed from the rows of `data` assigned to each cluster.

    Parameters
    ----------
    data : np.ndarray
        Input data

    n_clusters : int
        Number of clusters (components)

    n_dim : int
        Number of dimension for each Gaussian distribution

    ties : { 'average', 'min', 'max', 'dense', 'ordinal' }, optional
        Specifies how ranks should be computed if there are ties in any of the coordinate samples. This is
        effective only if the data has not been converted to its pseudo observations form

    Returns
    -------
    GMCParam
        The GMC model's parameters
    """
    # NOTE(review): newer scikit-learn releases renamed algorithm 'full' to 'lloyd' - confirm against
    # the version this project pins
    labels = KMeans(n_clusters, algorithm='full').fit(pseudo_obs(data, ties)).labels_

    # component weights are the relative cluster sizes
    cluster_ids, counts = np.unique(labels, return_counts=True)
    weights = counts / sum(counts)

    means, covs = [], []
    for cid in cluster_ids:
        members = data[labels == cid]
        means.append(members.mean(0))
        covs.append(np.cov(members, rowvar=False))

    return GMCParam(n_clusters, n_dim, weights, np.array(means), np.array(covs))
Exemplo n.º 3
0
    def _generate_new_copula(self):
        """
        Build a fresh copula of the same type as the reference copula and fit it.

        The new copula is fitted on pseudo observations re-ranked with ``self._fit_ties`` when the data
        has ties and the fitting tie method differs from ``self._ties``; otherwise it is fitted on a deep
        copy of the already computed ``self._u``.

        Returns
        -------
        A newly constructed and fitted instance of ``type(self._copula)``
        """
        if self._has_ties and self._fit_ties != self._ties:
            u_fit = pseudo_obs(self._data, ties=self._fit_ties)
        else:
            u_fit = deepcopy(self._u)

        # the data is a matrix, so the number of columns lives on axis 1 (axis 2 does not exist)
        cop = type(self._copula)(dim=self._data.shape[1])
        cop.fit(u_fit)
        return cop
Exemplo n.º 4
0
    def __init__(self, data: Union[pd.DataFrame, np.ndarray], ties: str, fit_ties):
        """
        Store a data matrix together with its pseudo observations and tie information.

        Parameters
        ----------
        data
            Input data; a DataFrame is converted with ``to_numpy``, anything else with ``np.asarray``

        ties
            Tie-handling method used to compute ``pobs``

        fit_ties
            Tie-handling method used to compute ``fitted_pobs``
        """
        if isinstance(data, pd.DataFrame):
            self.data = data.to_numpy()
        else:
            self.data = np.asarray(data)
        self.ties = ties
        self.fit_ties = fit_ties

        # a column contains ties when it has fewer distinct values than there are rows
        nrow, ncol = self.data.shape
        self.has_ties = any(len(np.unique(self.data[:, j])) != nrow for j in range(ncol))

        # data used for fitting the main copula
        self.pobs = pseudo_obs(self.data, ties=ties)

        # pseudo observations ranked with `fit_ties`; recomputed only when the data has ties and the
        # two tie methods differ, otherwise `pobs` is reused as-is
        needs_own_ranks = self.has_ties and ties != fit_ties
        self.fitted_pobs = pseudo_obs(self.data, ties=fit_ties) if needs_own_ranks else self.pobs

        # ranks of the data, sorted along axis 0 and shifted by -1 (presumably zero-based indices - confirm)
        sorted_ranks = np.sort(rank_data(self.data, 1), 0)
        self._duplicated_rank_array = sorted_ranks.astype(int) - 1
Exemplo n.º 5
0
def gof_t_stat(copula: AbstractCopula,
               data: np.ndarray,
               ties="average",
               *,
               to_pobs=True):
    """
    Computes the T Statistic of the copula

    The statistic is the sum of squared differences between the empirical distribution function
    evaluated at the data and the copula's CDF evaluated at the same points.

    Parameters
    ----------
    copula
        Copula whose ``cdf`` is compared against the empirical distribution

    data
        Input data

    ties
        Tie-handling method, used for the pseudo observation conversion and passed to ``emp_dist_func``

    to_pobs
        If True, `data` is first converted to pseudo observations
    """
    u = pseudo_obs(data, ties) if to_pobs else data

    theoretical = copula.cdf(u)
    empirical = emp_dist_func(u, u, smoothing='none', ties=ties)

    squared_gap = (empirical - theoretical) ** 2
    return sum(squared_gap)
Exemplo n.º 6
0
 def __init__(self,
              copula: AbstractCopula,
              data: Union[pd.DataFrame, np.ndarray],
              reps: int,
              ties="average",
              fit_ties="average"):
     """
     Keep the reference copula, the data and the settings used by later computations.

     Parameters
     ----------
     copula
         Reference copula

     data
         Input data; a DataFrame contributes its ``values``, anything else goes through ``np.asarray``

     reps
         Number of repetitions, stored as an ``int``

     ties
         Tie-handling method used to compute the stored pseudo observations

     fit_ties
         Tie-handling method stored for later fitting
     """
     self._copula = copula

     if isinstance(data, pd.DataFrame):
         self._data = data.values
     else:
         self._data = np.asarray(data)

     self._has_ties = self._data_has_ties(self._data)
     self._reps = int(reps)
     self._fit_ties = fit_ties
     self._ties = ties

     # pseudo observations of the stored data under the main tie method
     self._u = pseudo_obs(self._data, ties=ties)
Exemplo n.º 7
0
    def pobs(data, ties='average'):
        """
        Convert a data matrix to its pseudo-observations

        Thin convenience wrapper that delegates to ``pseudo_obs``.

        Parameters
        ----------
        data: {array_like, DataFrame}
            Random variates to be converted to pseudo-observations

        ties: { 'average', 'min', 'max', 'dense', 'ordinal' }, optional
            How ranks are assigned when a coordinate sample contains ties

        Returns
        -------
        ndarray
            matrix or vector of the same dimension as `data` containing the pseudo observations
        """
        pseudo = pseudo_obs(data, ties)
        return pseudo
Exemplo n.º 8
0
    def pobs(data: np.ndarray, ties='average'):
        """
        Convert a data matrix to its pseudo-observations

        Thin convenience wrapper that delegates to ``pseudo_obs``.

        Parameters
        ----------
        data: (N, D) ndarray
            Random variates to be converted to pseudo-observations

        ties: { 'average', 'min', 'max', 'dense', 'ordinal' }, optional
            How ranks are assigned when a coordinate sample contains ties

        Returns
        -------
        ndarray
            matrix or vector of the same dimension as `data` containing the pseudo observations
        """
        pseudo = pseudo_obs(data, ties)
        return pseudo
Exemplo n.º 9
0
    def fit(self, data: Union[pd.DataFrame, np.ndarray], x0: Union[Collection[float], np.ndarray, GMCParam] = None,
            method: EstimateMethod = 'pem', optim_options: dict = None, ties: Ties = 'average', verbose=1,
            max_iter=3000,
            criteria: Criteria = 'GMCM', eps=1e-4):
        """
        Fit the copula with specified data

        Parameters
        ----------
        data
            Array of data used to fit copula. Usually, data should not be pseudo observations as this will
            skew the model parameters

        x0
            Initial starting point. If value is None, best starting point will be estimated via K-means

        method
            Method of fitting (case-insensitive). Supported methods are: 'pem' - Expectation Maximization with
            pseudo log-likelihood, 'kmeans' - K-means, 'sgd' - stochastic gradient descent

        optim_options : dict, optional
            Keyword arguments to pass into scipy.optimize.minimize. Only applicable for gradient-descent
            optimizations

        ties : { 'average', 'min', 'max', 'dense', 'ordinal' }, optional
            Specifies how ranks should be computed if there are ties in any of the coordinate samples. This is
            effective only if the data has not been converted to its pseudo observations form. The same
            setting is used for the K-means step and for the 'pem' / 'sgd' estimators

        verbose:
            Log level for the estimator. The higher the number, the more verbose it is. 0 prints nothing.

        max_iter : int
            Maximum number of iterations

        criteria : { 'GMCM', 'GMM', 'Li' }
            The stopping criteria. Only applicable for Expectation Maximization (EM).  'GMCM' uses the absolute
            difference between the current and last based off the GMCM log likelihood, 'GMM' uses the absolute
            difference between the current and last based off the GMM log likelihood and 'Li' uses the stopping
            criteria defined by Li et. al. (2011)

        eps : float
            The epsilon value for which any absolute delta will mean that the model has converged

        Returns
        -------
        The fitted instance itself, so that calls can be chained

        Raises
        ------
        GMCFitMethodError
            If `method` is not one of 'kmeans', 'pem' or 'sgd'. The check happens before any fitting, so
            the current parameters are left untouched

        Notes
        -----
        Maximizing the exact likelihood of GMCM is technically intractable using expectation maximization. The
        'pem' method therefore runs expectation maximization on the pseudo observations of the data.

        See Also
        --------
        :code:`scipy.optimize.minimize`: the `scipy minimize <https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html#scipy.optimize.minimize>`_ function use for optimization
        """
        method = method.lower()
        if method not in ('kmeans', 'pem', 'sgd'):
            # fail fast: reject unknown methods before running K-means or overwriting self.params
            raise GMCFitMethodError(f"Invalid method: {method}. Use one of (kmeans, pem, sgd)")

        # resolve the starting parameters
        if x0 is None:
            self.params = k_means(data, self.clusters, self.dim, ties)
        elif isinstance(x0, GMCParam):
            self.params = x0
        else:
            self.params = GMCParam.from_vector(x0, self.clusters, self.dim)

        if method == 'pem':
            u = pseudo_obs(data, ties)
            self.params = expectation_maximization(u, self.params, max_iter, criteria, verbose, eps)
        elif method == 'sgd':
            u = pseudo_obs(data, ties)
            self.params = gradient_descent(u, self.params, max_iter=max_iter, **(optim_options or {}))
        elif x0 is not None:
            # method == 'kmeans': when x0 is None the K-means fit above is already the final answer
            self.params = k_means(data, self.clusters, self.dim, ties)

        self._fit_details["method"] = method
        return self
Exemplo n.º 10
0
def rad_sym_test(x, N=1000, ties: Ties = 'average'):
    r"""
    Test of Radial Symmetry for a Multivariate Copula.

    Test for assessing the radial symmetry of the underlying multivariate copula based on the empirical copula. The
    test statistic is a multivariate extension of the definition adopted in the first reference. An approximate
    p-value for the test statistic is obtained by means of an appropriate bootstrap which can take the presence of
    ties in the component series of the data into account; see the second reference.

    A random vector :math:`X` is called radially symmetric (for d = 1 simply symmetric) about :math:`a \in R^d` if
    :math:`X − a = a − X`, that is, if :math:`X − a` and :math:`a − X` are equal in distribution. In a hand-wavy
    manner, perhaps the consequence of the radial symmetry test is to verify if an elliptical copula should be used
    to fit the data as elliptical copulas are radial symmetric.

    Parameters
    ----------
    x: {array_like, pandas.DataFrame}
        A matrix like data structure

    N: int
        Number of bootstrap iterations to be used to simulate realizations of the test statistic under the null
        hypothesis

    ties
        String specifying how ranks should be computed if there are ties in any of the coordinate samples. Options
        include 'average', 'min', 'max', 'dense', 'ordinal'.

    Returns
    -------
    TestStatistic
        Test statistics for the radial symmetry test. The null hypothesis assumes that the vectors are radially
        symmetric. Thus a small p-value will indicate evidence against radial symmetry

    Raises
    ------
    AssertionError
        If `N` is not a positive integer or `x` is not 2-dimensional

    Examples
    --------
    >>> from copulae.datasets import load_danube
    >>> from copulae.gof import rad_sym_test

    >>> danube = load_danube()
    >>> test_stats = rad_sym_test(danube)
    >>> print(test_stats.p_value)

    A small p-value here indicates strong evidence against radial symmetry.

    References
    ----------
    Genest, C. and G. Nešlehová, J. (2014). On tests of radial symmetry for bivariate copulas. Statistical Papers 55,
    1107–1119.

    Kojadinovic, I. (2017). Some copula inference procedures adapted to the presence of ties. Computational Statistics
    and Data Analysis 112, 24–41, http://arxiv.org/abs/1609.05519.
    """
    x = np.asarray(x)

    assert isinstance(
        N, int
    ) and N >= 1, "number of bootstrap iterations for radial symmetry test must be a positive integer"
    assert x.ndim == 2, "input data must be a 2-dimensional matrix"

    n, p = x.shape
    u = pseudo_obs(x, ties)

    # the statistic is computed on the column-major flattening of the pseudo observations
    s = rad_sym_test_stat(u.ravel('F'), n, p)

    # ties are detected on the raw data: a column has ties when it holds fewer distinct values than rows
    has_ties = any(len(np.unique(x[:, i])) != n for i in range(p))

    ir = np.floor(rank_data(np.sort(u, 0), axis=1)).astype(int) - 1
    s0 = np.array([rad_sym_replicate(u, ir, n, p, has_ties) for _ in range(N)])

    # p-value: share of bootstrap replicates at least as large as the observed statistic,
    # with a +0.5 / +1 correction so that it is never exactly 0
    return TestStatistic(
        s, (np.sum(s0 >= s) + 0.5) / (N + 1),
        "Test of radial symmetry based on the empirical copula")
def empirical_copula_loss(x, data, epsilon):
    """
    Absolute gap between an empirical proportion of the data and the level ``1 - epsilon``.

    The proportion is the share of pseudo-observation rows whose every component is less than or
    equal to `x`.
    """
    pseudo_data = pseudo_obs(data)

    # repeat x once per column so the comparison covers every component of a row
    n_cols = pseudo_data.shape[1]
    threshold = np.array([x] * n_cols)

    rows_below = np.all(np.less_equal(pseudo_data, threshold), axis=1)
    return np.fabs(np.mean(rows_below) - 1 + epsilon)
Exemplo n.º 12
0
def U(residual_data):
    """Return the pseudo observations of `residual_data`, as computed by ``pseudo_obs`` with its defaults."""
    pseudo = pseudo_obs(residual_data)
    return pseudo
Exemplo n.º 13
0
def exch_test(x, y, N=1000, m=0, ties='average'):
    r"""
    Test of Exchangeability for a Bivariate Copula.

    Test for assessing the exchangeability of the underlying bivariate copula based on the empirical copula.
    The test statistics are defined in the first two references. Approximate p-values for the test statistics are
    obtained by means of a multiplier technique if there are no ties in the component series of the bivariate
    data, or by means of an appropriate bootstrap otherwise.

    A random vector X is called exchangeable iff :math:`(X1, ..., Xd) = (X_{\pi(1)}, ..., X_{\pi(d)})`
    for any permutation :math:`(\pi(1), \pi(2), \dots, \pi(d))` of :math:`(1, \dots, d)`.

    A copula C is called exchangeable iff C is the distribution function of an exchangeable random vector
    (with uniform marginal distributions on [0, 1]). For such a copula
    :math:`C(u1, u2, ..., ud ) = C(u\pi(1), u\pi(2), ..., u\pi(d))` holds for any permutation
    :math:`(\pi(1), \pi(2), \dots, \pi(d))` of :math:`(1, \dots, d)`.

    Examples of exchangeable copulas:
        Gumbel, Clayton, and also the Gaussian copula :math:`C_P^{Ga}` and the t-Copula :math:`C_{ν,P}^t`, if
        P is an equicorrelation matrix, i.e. :math:`R = \rho J_d + (1 − \rho)I_d`. :math:`J_d \in R^{d×d}`
        is a matrix consisting only of ones, and :math:`I_d \in R^{d×d}` is the d-dimensional identity matrix.

    For bivariate exchangeable copulas we have:

    .. math::

        P(U_2 \leq u_2|U_1 = u_1) = P(U_1 \leq u_2|U_2 = u_1).

    Parameters
    ----------
    x: array_like
        first vector for the exchangeability test

    y: array_like
        second vector for the exchangeability test

    N: int
        Number of multiplier or bootstrap iterations to be used to simulate realizations of the test statistic under
        the null hypothesis.

    m: int
        If m = 0, integration in the Cramér–von Mises statistic is carried out with respect to the empirical copula.
        If m > 0, integration is carried out with respect to the Lebesgue measure and m specifies the size of the
        integration grid.

    ties: str, optional
        String specifying how ranks should be computed if there are ties in any of the coordinate samples. Options
        include 'average', 'min', 'max', 'dense', 'ordinal'.

    Returns
    -------
    TestStatistic
        Test statistics for the exchangeability test. The null hypothesis assumes that the vectors are exchangeable.
        Thus a small p-value will indicate evidence against exchangeability

    Raises
    ------
    AssertionError
        If `m` is not an integer >= 0, if `x` or `y` is not one-dimensional, or if `N` is not a positive
        integer. These checks run before any computation

    Examples
    --------
    >>> from copulae.datasets import load_danube
    >>> from copulae.gof import exch_test
    >>> danube = load_danube().values
    >>> test_stats = exch_test(danube[:, 0], danube[:, 1])
    >>> print(test_stats.p_value)

    A small p-value here indicates strong evidence against exchangeability

    References
    ----------
    Genest, C., G. Nešlehová, J. and Quessy, J.-F. (2012). Tests of symmetry for bivariate copulas. Annals of the
    Institute of Statistical Mathematics 64, 811–834.

    Kojadinovic, I. and Yan, J. (2012). A nonparametric test of exchangeability for extreme-value and left-tail
    decreasing bivariate copulas. The Scandinavian Journal of Statistics 39:3, 480–496.

    Kojadinovic, I. (2017). Some copula inference procedures adapted to the presence of ties. Computational Statistics
    and Data Analysis 112, 24–41, http://arxiv.org/abs/1609.05519.
    """
    # validate everything up front so that bad arguments fail with these messages instead of an
    # obscure error from the ranking / stacking code below
    raw_x, raw_y = np.asarray(x), np.asarray(y)
    assert isinstance(
        m,
        int) and m >= 0, "size of the integration grid must be an integer >= 0"
    assert raw_x.ndim == 1 and raw_y.ndim == 1, "x and y must be vectors. Exchangeability tests is bivariate"
    assert isinstance(
        N, int
    ) and N >= 1, "number of replications for exchangeability test must be a positive integer"

    # two-column matrix of pseudo observations, one row per observation
    u = np.vstack([pseudo_obs(x, ties), pseudo_obs(y, ties)]).T

    n = len(u)
    if m > 0:
        # regular m x m grid for integration with respect to the Lebesgue measure
        xis = np.linspace(1 / m, 1 - 1 / m, m)
        g = np.stack([np.tile(xis, m), np.repeat(xis, m)]).T
        ng = m * m
    else:
        # integrate with respect to the empirical copula: the grid is the sample itself
        g = u
        ng = n

    s = exch_test_stat(u, g, n, ng)

    # ties must be detected on the raw samples: ranks computed with ties='ordinal' are always
    # distinct, so looking at the pseudo observations would hide ties present in the data
    has_ties = len(np.unique(raw_x)) != n or len(np.unique(raw_y)) != n

    if has_ties:
        # bootstrap replicates that account for the ties
        ir = np.floor(rank_data(np.sort(u, 0), axis=1)).astype(int) - 1
        s0 = np.asarray(
            [exch_replication(ir, u, g, n, m, ng) for _ in range(N)])

    else:
        # no ties: multiplier technique
        s0 = exch_test_cn(u, g, n, ng, N)

    return TestStatistic(s, (np.sum(s0 >= s) + 0.5) / (N + 1),
                         "Test of exchangeability for bivariate copulas")