Example #1
    def __init__(self, y, x, w):

        # 1a. OLS --> \tilde{betas}
        ols = OLS.BaseOLS(y=y, x=x)
        self.n, self.k = ols.x.shape
        self.x = ols.x
        self.y = ols.y

        # 1b. GMM --> \tilde{\lambda1}
        moments = _momentsGM_Error(w, ols.u)
        lambda1 = optim_moments(moments)

        # 2a. OLS --> \hat{betas}
        xs = get_spFilter(w, lambda1, self.x)
        ys = get_spFilter(w, lambda1, self.y)
        ols2 = OLS.BaseOLS(y=ys, x=xs)

        # Output
        self.predy = spdot(self.x, ols2.betas)
        self.u = y - self.predy
        self.betas = np.vstack((ols2.betas, np.array([[lambda1]])))
        self.sig2 = ols2.sig2n
        self.e_filtered = self.u - lambda1 * w * self.u

        self.vm = self.sig2 * ols2.xtxi
        se_betas = np.sqrt(self.vm.diagonal())
        self._cache = {}
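
The constructor above follows the two-step GM recipe of Kelejian and Prucha: OLS residuals feed a moments-based estimate of lambda (steps 1a-1b), and the spatially filtered y and X are re-estimated by OLS (step 2a). Below is a minimal dense-NumPy sketch of the filtering step that get_spFilter performs; the helper name spatial_filter, the ring weights and the toy data are illustrative, not part of spreg.

import numpy as np

def spatial_filter(z, lam, W):
    """Spatial Cochrane-Orcutt transform z - lam * W z (dense sketch)."""
    return z - lam * (W @ z)

# Toy row-standardized "ring" weights for 5 observations (illustrative only).
n = 5
W = np.zeros((n, n))
for i in range(n):
    W[i, (i - 1) % n] = 0.5
    W[i, (i + 1) % n] = 0.5

rng = np.random.default_rng(0)
y = rng.normal(size=(n, 1))
ys = spatial_filter(y, 0.3, W)   # plays the role of get_spFilter(w, lambda1, y)
print(ys.shape)                  # (5, 1)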
Example #2
def vif(reg):
    """
    Calculates the variance inflation factor for each independent variable.
    To ease indexing of the results, the constant is currently included;
    it should be omitted when reporting the results in the output text.
    [Greene2003]_

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------    
    vif_result      : list of tuples
                      each tuple includes the vif and the tolerance, the
                      order of the variables corresponds to their order in
                      the reg.x matrix

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the variance inflation factor (VIF).

    >>> testresult = diagnostics.vif(reg)

    Select the tuple for the income variable. 

    >>> incvif = testresult[1]

    Print the VIF for income. 

    >>> print("%12.12f"%incvif[0])
    1.333117497189

    Print the tolerance for income. 

    >>> print("%12.12f"%incvif[1])
    0.750121427487

    Repeat for the home value variable. 

    >>> hovalvif = testresult[2]
    >>> print("%12.12f"%hovalvif[0])
    1.333117497189
    >>> print("%12.12f"%hovalvif[1])
    0.750121427487

    """
    X = reg.x
    n, k = X.shape
    vif_result = []

    import ols as OLS  # OLS estimator used for each auxiliary regression
    for j in range(k):
        Z = X.copy()
        Z = np.delete(Z, j, 1)
        y = X[:, j]
        aux = OLS.BaseOLS(y, Z)
        mean_y = aux.mean_y
        utu = aux.utu
        ss_tot = sum((y - mean_y)**2)
        if ss_tot == 0:
            resj = pysal.MISSINGVALUE
        else:
            r2aux = 1 - utu / ss_tot
            tolj = 1 - r2aux
            vifj = 1 / tolj
            resj = (vifj, tolj)
        vif_result.append(resj)
    return vif_result
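
For reference, the loop above computes VIF_j = 1 / (1 - R2_j), where R2_j comes from regressing column j of X on the remaining columns, and the tolerance is 1 - R2_j. The following self-contained NumPy sketch performs the same calculation; the function name vif_numpy is illustrative.

import numpy as np

def vif_numpy(X):
    """VIF and tolerance for each column of X (include the constant in X)."""
    n, k = X.shape
    out = []
    for j in range(k):
        y = X[:, j]
        Z = np.delete(X, j, axis=1)
        beta, *_ = np.linalg.lstsq(Z, y, rcond=None)
        resid = y - Z @ beta
        ss_tot = ((y - y.mean()) ** 2).sum()
        if ss_tot == 0:                      # constant column: R^2 undefined
            out.append(None)
            continue
        r2 = 1 - (resid @ resid) / ss_tot
        tol = 1 - r2
        out.append((1 / tol, tol))
    return out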
Example #3
def white(reg):
    """
    Calculates the White test to check for heteroscedasticity. [White1980]_

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    white_result    : dictionary
                      contains the statistic (wh), degrees of freedom
                      (df) and the associated p-value (pvalue) for the
                      White test.
    wh              : float
                      scalar value for the White test statistic.
    df              : integer
                      degrees of freedom associated with the test
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed with df degrees of freedom)

    Notes
    -----
    The x attribute of the reg object must include a constant term. This is
    standard for spreg.OLS, so no check for the constant is performed.

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the White test for heteroscedasticity.

    >>> testresult = diagnostics.white(reg)

    Print the degrees of freedom for the test.

    >>> print(testresult['df'])
    5

    Print the test statistic.

    >>> print("%1.3f"%testresult['wh'])
    19.946

    Print the associated p-value. 

    >>> print("%1.4f"%testresult['pvalue'])
    0.0013

    """
    e = reg.u**2
    k = int(reg.k)
    n = int(reg.n)
    y = reg.y
    X = reg.x
    #constant = constant_check(X)

    # Check for constant, if none add one, see Greene 2003, pg. 222
    # if constant == False:
    #    X = np.hstack((np.ones((n,1)),X))

    # Check for multicollinearity in the X matrix
    ci = condition_index(reg)
    if ci > 30:
        white_result = "Not computed due to multicollinearity."
        return white_result

    # Compute cross-products and squares of the regression variables
    if type(X).__name__ == 'ndarray':
        A = np.zeros((n, (k * (k + 1)) // 2))
    elif type(X).__name__ == 'csc_matrix' or type(X).__name__ == 'csr_matrix':
        # this is probably inefficient
        A = SP.lil_matrix((n, (k * (k + 1)) // 2))
    else:
        raise Exception("unknown X type, %s" % type(X).__name__)
    counter = 0
    for i in range(k):
        for j in range(i, k):
            v = spmultiply(X[:, i], X[:, j], False)
            A[:, counter] = v
            counter += 1

    # Append the original variables
    A = sphstack(X, A)  # note: this also converts a LIL to CSR
    n, k = A.shape

    # Check to identify any duplicate or constant columns in A
    omitcolumn = []
    for i in range(k):
        current = A[:, i]
        # remove all constant terms (will add a constant back later)
        if spmax(current) == spmin(current):
            omitcolumn.append(i)
        # do not allow duplicates
        for j in range(k):
            check = A[:, j]
            if i < j:
                test = abs(current - check).sum()
                if test == 0:
                    omitcolumn.append(j)
    uniqueomit = set(omitcolumn)
    omitcolumn = list(uniqueomit)

    # Now the identified columns must be removed
    if type(A).__name__ == 'ndarray':
        A = np.delete(A, omitcolumn, 1)
    elif type(A).__name__ == 'csc_matrix' or type(A).__name__ == 'csr_matrix':
        # this is probably inefficient
        keepcolumn = list(range(k))
        for i in omitcolumn:
            keepcolumn.remove(i)
        A = A[:, keepcolumn]
    else:
        raise Exception("unknown A type, %s" % type(A).__name__)
    A = sphstack(np.ones((A.shape[0], 1)), A)  # add a constant back in
    n, k = A.shape

    # Conduct the auxiliary regression and calculate the statistic
    import ols as OLS
    aux_reg = OLS.BaseOLS(e, A)
    aux_r2 = r2(aux_reg)
    wh = aux_r2 * n
    df = k - 1
    pvalue = chisqprob(wh, df)
    white_result = {'df': df, 'wh': wh, 'pvalue': pvalue}
    return white_result
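
The value returned above is the usual White n*R2 statistic: the squared OLS residuals are regressed on the original regressors plus their squares and cross-products, and n*R2 from that auxiliary regression is asymptotically chi-square with k - 1 degrees of freedom (k counting the auxiliary constant). The sketch below shows just that final step for dense arrays, using scipy.stats.chi2.sf in place of the deprecated chisqprob; the function name white_from_aux is illustrative.

import numpy as np
from scipy.stats import chi2

def white_from_aux(e2, A):
    """n * R^2 test from regressing squared residuals e2 (n,) on the auxiliary matrix A (n, k)."""
    n, k = A.shape
    beta, *_ = np.linalg.lstsq(A, e2, rcond=None)
    resid = e2 - A @ beta
    r2 = 1 - (resid @ resid) / ((e2 - e2.mean()) @ (e2 - e2.mean()))
    wh = n * r2
    df = k - 1
    return {'df': df, 'wh': wh, 'pvalue': chi2.sf(wh, df)}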
Example #4
        if len(name_y) > 1 and isinstance(name_y, list):
            name_y = ''.join([i for i in name_y[0] if not i.isdigit()])
        if len(name_y) == 1 and isinstance(name_y, list):
            name_y = name_y[0]
    if name_x:
        if len(name_x) != k*T and len(name_x) != k:
            raise Exception("Names of columns in X must have exactly either k or k*t elements.")
        if len(name_x) > k:
            name_bigx = []
            for i in range(k):
                name_bigx.append(''.join([j for j in name_x[i*T] if not j.isdigit()]))
            name_x = name_bigx
       
    return bigy, bigx, name_y, name_x
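
The digit stripping above collapses per-period column names such as "INC1980" and "INC1990" into a single base name per regressor; a quick illustration with a made-up name:

base = ''.join(ch for ch in "INC1980" if not ch.isdigit())
print(base)   # INC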

ols = OLS.BaseOLS(y=y, x=x)
x, y, n, k, xtx = ols.x, ols.y, ols.n, ols.k, ols.xtx
N = w.n
T = y.shape[0]//N
moments, trace_w2 = _moments_kkp(w.sparse, ols.u, 0)
lambda1, sig_v = optim_moments(moments, all_par=True)
Tw = SP.kron(SP.identity(T),w.sparse)
ub = Tw.dot(ols.u)
ulu = ols.u - lambda1*ub
Q1 = SP.kron(np.ones((T,T))/T,SP.identity(N))
sig_1 = float(np.dot(ulu.T,Q1.dot(ulu))/N)
#print('initial_lamb_sig:',lambda1,sig_v,sig_1)
#print('theta:', 1 - np.sqrt(sig_v)/ np.sqrt(sig_1))
Xi_a = SP.diags([(sig_v*sig_v)/(T-1),sig_1*sig_1])
if full_weights:
    Tau = _get_Tau(w.sparse,trace_w2)
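
In the snippet above, Q1 = kron(J_T / T, I_N) is a group-averaging operator: it averages across the T blocks of N observations, i.e. it returns each unit's time mean when the data are stacked by time period. A tiny check with toy dimensions (N = 3, T = 2; values are illustrative):

import numpy as np
import scipy.sparse as SP

N, T = 3, 2
u = np.arange(N * T, dtype=float).reshape(-1, 1)     # [[0],[1],[2],[3],[4],[5]]
Q1 = SP.kron(np.ones((T, T)) / T, SP.identity(N))    # same construction as above
print(Q1.dot(u).ravel())   # [1.5 2.5 3.5 1.5 2.5 3.5] -> means of (0,3), (1,4), (2,5)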