from numpy import array, sum, transpose  # numpy's sum so the full-array reduction below returns a scalar
from scipy.stats import chi2


def chisquared(classFeatureMatrix):
    """
    Chi-squared statistic for a feature and a set of classes.
    Classes are indexed by rows (1st index) and feature values are indexed by columns.
    Entries are counts of the feature value given the class.
    """
    # counts of each class: sum the rows
    classcounts = classFeatureMatrix.sum(axis=1)
    
    # feature value counts: sum the columns
    valuecounts = classFeatureMatrix.sum(axis=0)
    
    # total count of observations
    total = sum(classcounts)
    
    # expected observations under the independence assumption
    expected = array([valuecounts] * len(classcounts), dtype='float')
    expected = transpose(transpose(expected)*classcounts/total)
    
    # chi-squared statistic: (obs-expected)^2/expected where obs is classFeatureMatrix
    chi = sum(((classFeatureMatrix - expected)**2)/expected)
    
    # degrees of freedom
    df = (expected.shape[0]-1)*(expected.shape[1]-1)
    
    # the p-value; 1 minus the cdf of chi-squared at the given statistic value
    return 1-chi2(df=df).cdf(chi)
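
A quick usage sketch (the counts below are made up for illustration): rows are classes, columns are feature values; the result should match scipy.stats.chi2_contingency on the same table.

from numpy import array
from scipy.stats import chi2_contingency

counts = array([[30, 10],
                [20, 40]])          # hypothetical class-by-feature-value counts

print(chisquared(counts))                              # p-value from the function above
print(chi2_contingency(counts, correction=False)[1])   # should agree (no continuity correction)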
Example #2
    def log_pdf_at_quantile(self, alphas):
        """
        Computes the log-pdf at a given 1d-vector of quantiles
        """
        chi2_instance = chi2(self.dimension)
        # chi2 quantile = squared Mahalanobis radius at each requested level
        cutoffs = chi2_instance.isf(1 - alphas)

        log_determinant_part = -sum(log(diag(self.L)))
        quadratic_part = -0.5 * cutoffs
        const_part = -0.5 * len(self.L) * log(2 * pi)

        return const_part + log_determinant_part + quadratic_part
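
To make the pieces above concrete, here is a self-contained sketch (dimension, covariance, and quantile are all assumed values): for a zero-mean Gaussian with covariance Sigma = L @ L.T, where L is the Cholesky factor stored in self.L, the log-pdf at any point whose squared Mahalanobis distance equals the chi2 quantile is exactly const_part + log_determinant_part + quadratic_part.

import numpy as np
from scipy.stats import chi2, multivariate_normal

d = 3                                            # hypothetical dimension
Sigma = np.array([[2.0, 0.3, 0.0],
                  [0.3, 1.5, 0.2],
                  [0.0, 0.2, 1.0]])              # hypothetical covariance
L = np.linalg.cholesky(Sigma)

alpha = 0.9
r2 = chi2(d).isf(1 - alpha)                      # squared Mahalanobis radius at quantile alpha

closed_form = (-0.5 * d * np.log(2 * np.pi)      # const_part
               - np.sum(np.log(np.diag(L)))      # log_determinant_part (= -0.5 * log|Sigma|)
               - 0.5 * r2)                       # quadratic_part

# any point at that radius has the same density; take one along a principal axis of Sigma
vals, vecs = np.linalg.eigh(Sigma)
x = np.sqrt(r2 * vals[-1]) * vecs[:, -1]
print(closed_form, multivariate_normal(mean=np.zeros(d), cov=Sigma).logpdf(x))  # should match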
Example #3
 def emp_quantiles(self, X, quantiles=arange(0.1, 1, 0.1)):
     # need inverse chi2 cdf with self.dimension degrees of freedom
     chi2_instance = chi2(self.dimension)
     cutoffs = chi2_instance.isf(1 - quantiles)
     # whitening
     D, U = eig(self.L.dot(self.L.T))
     D = D**(-0.5)
     W = (diag(D).dot(U.T).dot((X - self.mu).T)).T
     norms_squared = array([norm(w)**2 for w in W])
     results = zeros([len(quantiles)])
     for jj in range(0, len(quantiles)):
         results[jj] = mean(norms_squared < cutoffs[jj])
     return results
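
As a self-contained sanity check (mean, factor, and sample size are assumed values): when X is drawn from the Gaussian itself, each entry of the result should sit close to its nominal quantile. The whitening below uses L directly instead of the eigendecomposition above; the squared norms are identical either way.

import numpy as np
from scipy.stats import chi2

rng = np.random.default_rng(0)
d, n = 2, 20000
mu = np.array([1.0, -2.0])                       # hypothetical mean
L = np.array([[1.0, 0.0],
              [0.5, 0.8]])                       # hypothetical factor, Sigma = L @ L.T
X = mu + rng.standard_normal((n, d)) @ L.T       # samples from N(mu, Sigma)

quantiles = np.arange(0.1, 1, 0.1)
cutoffs = chi2(d).isf(1 - quantiles)

W = np.linalg.solve(L, (X - mu).T).T             # whitening via L instead of eig
norms_squared = np.sum(W ** 2, axis=1)
print([float(np.mean(norms_squared < c)) for c in cutoffs])   # roughly 0.1, 0.2, ..., 0.9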
Example #4
def gaussian_pvalue(X, mu, cov, ndof=None):
    r"""calculates p-value for the assumptions of `x` originating from a
    multivariate Gaussian pdf with mean `mu` and covariance `cov`.
    It exploits the fact that the mahalobonis distance of `x`

    .. math::
        d^2 = (\vec x - \vec \mu)^\top \Sigma^{-1} (\vec x - \vec \mu)

    is :math:`\chi^2`-distributed with :math:`n_\mathrm{dof} = \dim(\vec x)` degrees
    of freedom; the p-value is then :math:`\mathrm{cdf}_{\chi^2}(d^2)`.

    Parameters
    ----------
    X : numpy array, shape=(n_samples, n_dim) or (n_dim,)
        Sample to calculate the Mahalanobis distance for.

    mu : numpy array, shape=(n_dim,)
        Mean of the Gaussian distribution

    cov : numpy array, shape=(n_dim, n_dim)
        Covariance matrix of the Gaussian distribution

    ndof : float, optional
        Number of degrees of freedom for the chi2 distribution. If `None`,
        `n_dim` will be used.

    Returns
    -------
    pvals : numpy array, shape=(n_samples,)
        The p-values
    """
    if X.ndim == 1:
        X = X.reshape(1, -1)
    dsquared = mahalanobis(X, mu, cov)
    if ndof is None:
        ndof = X.shape[1]
    return chi2(ndof).cdf(dsquared)
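
For orientation, a self-contained sketch of the same computation (sample points are made up, and the `mahalanobis` helper used above is assumed to return squared distances):

import numpy as np
from scipy.stats import chi2

mu = np.array([0.0, 0.0])
cov = np.array([[1.0, 0.3],
                [0.3, 2.0]])
X = np.array([[0.5, -1.0],
              [3.0, 4.0]])                       # hypothetical sample points

diff = X - mu
dsquared = np.einsum('ij,jk,ik->i', diff, np.linalg.inv(cov), diff)
print(chi2(X.shape[1]).cdf(dsquared))            # same values gaussian_pvalue(X, mu, cov) would return
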
N = 1000
data = dgp(N, *truth)
y, X = data
Winv = Omegahat(beta, sigma_u, data)


def J(b, s, W, data):

    m = gN(b, s, data)  # Sample moments @ b, s
    N = data[0].shape[0]

    return N * m.T @ W @ m  # Scale by sample size


# Limiting distribution of J under the null:
# chi2 with (number of moments - number of parameters) degrees of freedom
limiting_J = iid.chi2(1 * 2 - 2)

import scipy.optimize as optimize


def two_step_gmm(data):

    # First step uses identity weighting matrix
    W1 = np.eye(gj(1, 1, data).shape[1])
    x0 = [1, 1]

    def J2(params):
        b, s = params
        return J(b, s, W1, data)

    result = optimize.minimize(J2, x0)