Example #1
    def __iter__(self):
        n = self.n
        k = self.k
        start = self.start
        if self.return_slice:
            for i in range(start, n-k):
                train_slice = slice(None, i, None)
                if self.kall:
                    test_slice = slice(i, i+k)
                else:
                    test_slice = slice(i+k-1, i+k)
                yield train_slice, test_slice

        else: #for compatibility with other iterators
            for i in range(start, n-k):
                train_index  = np.zeros(n, dtype=bool)
                train_index[:i] = True
                test_index  = np.zeros(n, dtype=bool)
                if self.kall:
                    test_index[i:i+k] = True # np.logical_not(test_index)
                else:
                    test_index[i+k-1:i+k] = True
                #or faster to return np.arange(i,i+k) ?
                #returning slice should be faster in this case
                yield train_index, test_index
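A minimal usage sketch (not part of the original class) of the expanding-window split pattern produced by the slice branch above; n, k, start and kall mirror the attributes used in __iter__:

import numpy as np

n, k, start, kall = 7, 2, 3, True
data = np.arange(n)
for i in range(start, n - k):
    train_slice = slice(None, i)
    test_slice = slice(i, i + k) if kall else slice(i + k - 1, i + k)
    print(data[train_slice], data[test_slice])
# [0 1 2] [3 4]
# [0 1 2 3] [4 5]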
Example #2
    def test_generate_sample(self):
        process = ArmaProcess.from_coeffs([0.9])
        np.random.seed(12345)
        sample = process.generate_sample()
        np.random.seed(12345)
        expected = np.random.randn(100)
        for i in range(1, 100):
            expected[i] = 0.9 * expected[i - 1] + expected[i]
        assert_almost_equal(sample, expected)

        process = ArmaProcess.from_coeffs([1.6, -0.9])
        np.random.seed(12345)
        sample = process.generate_sample()
        np.random.seed(12345)
        expected = np.random.randn(100)
        expected[1] = 1.6 * expected[0] + expected[1]
        for i in range(2, 100):
            expected[i] = 1.6 * expected[i - 1] - 0.9 * expected[i - 2] + expected[i]
        assert_almost_equal(sample, expected)

        process = ArmaProcess.from_coeffs([1.6, -0.9])
        np.random.seed(12345)
        sample = process.generate_sample(burnin=100)
        np.random.seed(12345)
        expected = np.random.randn(200)
        expected[1] = 1.6 * expected[0] + expected[1]
        for i in range(2, 200):
            expected[i] = 1.6 * expected[i - 1] - 0.9 * expected[i - 2] + expected[i]
        assert_almost_equal(sample, expected[100:])


        np.random.seed(12345)
        sample = process.generate_sample(nsample=(100,5))
        assert_equal(sample.shape, (100,5))
Example #3
    def test_ftest_pvalues(self):
        res = self.results
        use_t = res.use_t
        k_vars = len(res.params)
        # check default use_t
        pvals = [res.wald_test(np.eye(k_vars)[k], use_f=use_t).pvalue
                                                   for k in range(k_vars)]
        assert_allclose(pvals, res.pvalues, rtol=5e-10, atol=1e-25)

        # automatic use_f based on results class use_t
        pvals = [res.wald_test(np.eye(k_vars)[k]).pvalue
                                                   for k in range(k_vars)]
        assert_allclose(pvals, res.pvalues, rtol=5e-10, atol=1e-25)

        # label for pvalues in summary
        string_use_t = 'P>|z|' if use_t is False else 'P>|t|'
        summ = str(res.summary())
        assert_(string_use_t in summ)

        # try except for models that don't have summary2
        try:
            summ2 = str(res.summary2())
        except AttributeError:
            summ2 = None
        if summ2 is not None:
            assert_(string_use_t in summ2)
Example #4
def varsim(coefs, intercept, sig_u, steps=100, initvalues=None, seed=None):
    """
    Simulate VAR(p) process, given coefficients and assuming Gaussian noise

    Parameters
    ----------
    coefs : ndarray
        Coefficients for the VAR lags of endog.
    intercept : None or ndarray 1-D (neqs,) or (steps, neqs)
        This can be either the intercept for each equation or an offset.
        If None, then the VAR process has a zero intercept.
        If intercept is 1-D, then the same (endog specific) intercept is added
        to all observations.
        If intercept is 2-D, then it is treated as an offset and is added as
        an observation specific intercept to the autoregression. In this case,
        the intercept/offset should have same number of rows as steps, and the
        same number of columns as endogenous variables (neqs).
    sig_u : ndarray
        Covariance matrix of the residuals or innovations.
        If sig_u is None, then an identity matrix is used.
    steps : None or int
        Number of observations to simulate; this includes the initial
        observations needed to start the autoregressive process.
        If offset is not None, then the exog of the model are used if they
        were provided in the model.
    seed : None or integer
        If seed is not None, then it will be used for the random
        variables generated by numpy.random.

    Returns
    -------
    endog_simulated : ndarray
        Endog of the simulated VAR process

    """
    rs = np.random.RandomState(seed=seed)
    rmvnorm = rs.multivariate_normal
    p, k, k = coefs.shape
    if sig_u is None:
        sig_u = np.eye(k)
    ugen = rmvnorm(np.zeros(len(sig_u)), sig_u, steps)
    result = np.zeros((steps, k))
    if intercept is not None:
        # intercept can be 2-D like an offset variable
        if np.ndim(intercept) > 1:
            if not len(intercept) == len(ugen):
                raise ValueError('2-D intercept needs to have length `steps`')
        # add intercept/offset also to initial values
        result += intercept
        result[p:] += ugen[p:]
    else:
        result[p:] = ugen[p:]

    # add in AR terms
    for t in range(p, steps):
        ygen = result[t]
        for j in range(p):
            ygen += np.dot(coefs[j], result[t-j-1])

    return result
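A minimal usage sketch for the function above, with an illustrative bivariate VAR(1); the coefficients, intercept and seed here are arbitrary:

import numpy as np

coefs = np.array([[[0.5, 0.1],
                   [0.0, 0.4]]])     # shape (p, k, k) = (1, 2, 2)
intercept = np.array([1.0, 1.0])     # same intercept added to every observation
sig_u = 0.1 * np.eye(2)              # innovation covariance
y = varsim(coefs, intercept, sig_u, steps=250, seed=12345)
print(y.shape)   # (250, 2); the first p rows are the start-up values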
Example #5
def make_lag_names(names, lag_order, trendorder=1, exog=None):
    """
    Produce list of lag-variable names. Constant / trends go at the beginning

    Examples
    --------
    >>> make_lag_names(['foo', 'bar'], 2, 1)
    ['const', 'L1.foo', 'L1.bar', 'L2.foo', 'L2.bar']

    """
    lag_names = []
    if isinstance(names, string_types):
        names = [names]

    # take care of lagged endogenous names
    for i in range(1, lag_order + 1):
        for name in names:
            if not isinstance(name, string_types):
                name = str(name) # will need consistent unicode handling
            lag_names.append('L'+str(i)+'.'+name)

    # handle the constant name
    if trendorder != 0:
        lag_names.insert(0, 'const')
    if trendorder > 1:
        lag_names.insert(1, 'trend')
    if trendorder > 2:
        lag_names.insert(2, 'trend**2')
    if exog is not None:
        for i in range(exog.shape[1]):
            lag_names.insert(trendorder + i, "exog" + str(i))
    return lag_names
Example #6
    def initialize(self, model):

        super(GlobalOddsRatio, self).initialize(model)

        if self.model.weights is not None:
            warnings.warn("weights not implemented for GlobalOddsRatio "
                          "cov_struct, using unweighted covariance estimate",
                          NotImplementedWarning)

        # Need to restrict to between-subject pairs
        cpp = []
        for v in model.endog_li:

            # Number of subjects in this group
            m = int(len(v) / self._ncut)
            i1, i2 = np.tril_indices(m, -1)

            cpp1 = {}
            for k1 in range(self._ncut):
                for k2 in range(k1 + 1):
                    jj = np.zeros((len(i1), 2), dtype=np.int64)
                    jj[:, 0] = i1 * self._ncut + k1
                    jj[:, 1] = i2 * self._ncut + k2
                    cpp1[(k2, k1)] = jj

            cpp.append(cpp1)

        self.cpp = cpp

        # Initialize the dependence parameters
        self.crude_or = self.observed_crude_oddsratio()
        if self.model.update_dep:
            self.dep_params = self.crude_or
Example #7
    def _eigval_decomp_SZ(self, irf_resim):
        """
        Returns
        -------
        W: array of eigenvectors
        eigva: list of eigenvalues
        k: matrix indicating column # of largest eigenvalue for each c_i,j

        """
        neqs = self.neqs
        periods = self.periods

        cov_hold = np.zeros((neqs, neqs, periods, periods))
        for i in range(neqs):
            for j in range(neqs):
                cov_hold[i,j,:,:] = np.cov(irf_resim[:,1:,i,j],rowvar=0)

        W = np.zeros((neqs, neqs, periods, periods))
        eigva = np.zeros((neqs, neqs, periods, 1))
        k = np.zeros((neqs, neqs))

        for i in range(neqs):
            for j in range(neqs):
                W[i,j,:,:], eigva[i,j,:,0], k[i,j] = util.eigval_decomp(cov_hold[i,j,:,:])
        return W, eigva, k
Example #8
def plot_full_acorr(acorr, fontsize=8, linewidth=8, xlabel=None,
                    err_bound=None):
    """

    Parameters
    ----------



    """
    import matplotlib.pyplot as plt

    config = MPLConfigurator()
    config.set_fontsize(fontsize)

    k = acorr.shape[1]
    fig, axes = plt.subplots(k, k, figsize=(10, 10), squeeze=False)

    for i in range(k):
        for j in range(k):
            ax = axes[i][j]
            acorr_plot(acorr[:, i, j], linewidth=linewidth,
                       xlabel=xlabel, ax=ax)

            if err_bound is not None:
                ax.axhline(err_bound, color='k', linestyle='--')
                ax.axhline(-err_bound, color='k', linestyle='--')

    adjust_subplots()
    config.revert()

    return fig
Example #9
def levinson_durbin_nitime(s, order=10, isacov=False):
    '''Levinson-Durbin recursion for autoregressive processes

    '''
    #from nitime

##    if sxx is not None and type(sxx) == np.ndarray:
##        sxx_m = sxx[:order+1]
##    else:
##        sxx_m = ut.autocov(s)[:order+1]
    if isacov:
        sxx_m = s
    else:
        sxx_m = acovf(s)[:order+1]  #not tested

    phi = np.zeros((order+1, order+1), 'd')
    sig = np.zeros(order+1)
    # initial points for the recursion
    phi[1,1] = sxx_m[1]/sxx_m[0]
    sig[1] = sxx_m[0] - phi[1,1]*sxx_m[1]
    for k in range(2,order+1):
        phi[k,k] = (sxx_m[k]-np.dot(phi[1:k,k-1], sxx_m[1:k][::-1]))/sig[k-1]
        for j in range(1,k):
            phi[j,k] = phi[j,k-1] - phi[k,k]*phi[k-j,k-1]
        sig[k] = sig[k-1]*(1 - phi[k,k]**2)

    sigma_v = sig[-1]
    arcoefs = phi[1:, -1]
    # partial autocorrelations are the diagonal of phi; the lag-0 value is 1 by definition
    pacf = np.diag(phi).copy()
    pacf[0] = 1.
    return sigma_v, arcoefs, pacf, phi  # return everything
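A quick sanity check, assuming the function above and numpy are available in one script: feeding the exact autocovariance of an AR(1) process with coefficient 0.7 and unit innovation variance (gamma_k = 0.7**k / (1 - 0.7**2)) should recover both the coefficient and the innovation variance:

import numpy as np

ar = 0.7
acov = ar ** np.arange(11) / (1 - ar ** 2)
sigma_v, arcoefs, pacf, phi = levinson_durbin_nitime(acov, order=10, isacov=True)
print(np.round(arcoefs, 4))   # approximately [0.7, 0, 0, ...]
print(round(sigma_v, 4))      # approximately 1.0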
Example #10
def prob_quantize_cdf(binsx, binsy, cdf):
    '''quantize a continuous distribution given by a cdf

    Parameters
    ----------
    binsx : array_like, 1d
        binedges

    '''
    binsx = np.asarray(binsx)
    binsy = np.asarray(binsy)
    nx = len(binsx) - 1
    ny = len(binsy) - 1
    probs = np.nan * np.ones((nx, ny)) #np.empty(nx,ny)
    cdf_values = cdf(binsx[:,None], binsy)
    cdf_func = lambda x, y: cdf_values[x,y]
    for xind in range(1, nx+1):
        for yind in range(1, ny+1):
            upper = (xind, yind)
            lower = (xind-1, yind-1)
            #print upper,lower,
            probs[xind-1,yind-1] = prob_bv_rectangle(lower, upper, cdf_func)

    assert not np.isnan(probs).any()
    return probs
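prob_bv_rectangle is not shown in this snippet; a plausible version (an assumption, not the original helper) uses the usual inclusion-exclusion identity for a bivariate cdf. Below is that helper plus a small check, all in one script, against the independence cdf on the unit square:

import numpy as np

def prob_bv_rectangle(lower, upper, cdf):
    # P(x_l < X <= x_u, y_l < Y <= y_u) by inclusion-exclusion (hypothetical helper)
    xl, yl = lower
    xu, yu = upper
    return cdf(xu, yu) - cdf(xu, yl) - cdf(xl, yu) + cdf(xl, yl)

def indep_uniform_cdf(x, y):
    # cdf of two independent uniform(0, 1) variables
    return np.clip(x, 0, 1) * np.clip(y, 0, 1)

binsx = np.linspace(0, 1, 5)
binsy = np.linspace(0, 1, 3)
probs = prob_quantize_cdf(binsx, binsy, indep_uniform_cdf)
print(probs.sum())   # 1.0; each of the 4 x 2 cells has mass 0.25 * 0.5 = 0.125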
Example #11
    def dataset(self, as_dict=False):
        """
        Returns a Python generator object for iterating over the dataset.


        Parameters
        ----------
        as_dict : bool, optional
            If as_dict is True, yield each row of observations as a dict.
            If False, yields each row of observations as a list.

        Returns
        -------
        Generator object for iterating over the dataset.  Yields each row of
        observations as a list by default.

        Notes
        -----
        If missing_values is True during instantiation of StataReader then
        observations with _StataMissingValue(s) are not filtered and should
        be handled by your application.
        """

        try:
            self._file.seek(self._data_location)
        except Exception:
            pass

        if as_dict:
            vars = lmap(str, self.variables())
            for i in range(len(self)):
                yield dict(zip(vars, self._next()))
        else:
            for i in range(self._header['nobs']):
                yield self._next()
Example #12
def prob_quantize_cdf_old(binsx, binsy, cdf):
    '''quantize a continuous distribution given by a cdf

    old version without precomputing cdf values

    Parameters
    ----------
    binsx : array_like, 1d
        binedges

    '''
    binsx = np.asarray(binsx)
    binsy = np.asarray(binsy)
    nx = len(binsx) - 1
    ny = len(binsy) - 1
    probs = np.nan * np.ones((nx, ny)) #np.empty(nx,ny)
    for xind in range(1, nx+1):
        for yind in range(1, ny+1):
            upper = (binsx[xind], binsy[yind])
            lower = (binsx[xind-1], binsy[yind-1])
            #print upper,lower,
            probs[xind-1,yind-1] = prob_bv_rectangle(lower, upper, cdf)

    assert not np.isnan(probs).any()
    return probs
Example #13
def approx_hess2(x, f, epsilon=None, args=(), kwargs={}, return_grad=False):
    #
    n = len(x)
    # NOTE: Ridout suggests using eps**(1/4)*theta
    h = _get_epsilon(x, 3, epsilon, n)
    ee = np.diag(h)
    f0 = f(*((x,)+args), **kwargs)
    # Compute forward and backward steps
    g = np.zeros(n)
    gg = np.zeros(n)
    for i in range(n):
        g[i] = f(*((x+ee[i, :],)+args), **kwargs)
        gg[i] = f(*((x-ee[i, :],)+args), **kwargs)

    hess = np.outer(h, h)  # this is now epsilon**2
    # Compute "double" forward step
    for i in range(n):
        for j in range(i, n):
            hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs) -
                          g[i] - g[j] + f0 +
                          f(*((x - ee[i, :] - ee[j, :],) + args), **kwargs) -
                          gg[i] - gg[j] + f0)/(2 * hess[i, j])
            hess[j, i] = hess[i, j]
    if return_grad:
        grad = (g - f0)/h
        return hess, grad
    else:
        return hess
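A quick check of the finite-difference Hessian above: for the quadratic form 0.5 * x' A x the exact Hessian is A, so the numerical result should be close. The import assumes the helper's public location in statsmodels.tools.numdiff:

import numpy as np
from statsmodels.tools.numdiff import approx_hess2

A = np.array([[3.0, 1.0],
              [1.0, 2.0]])
f = lambda x: 0.5 * x @ A @ x
hess = approx_hess2(np.array([0.5, -1.0]), f)
print(np.round(hess, 4))   # approximately [[3., 1.], [1., 2.]]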
Example #14
    def _prepare_structured_array(self, data):
        self.nobs = len(data)
        self.nvar = len(data.dtype)
        self.data = data
        self.datarows = iter(data)
        dtype = data.dtype
        descr = dtype.descr
        if dtype.names is None:
            varlist = _default_names(self.nvar)
        else:
            varlist = dtype.names

        # check for datetime and change the type
        convert_dates = self._convert_dates
        if convert_dates is not None:
            convert_dates = _maybe_convert_to_int_keys(convert_dates,
                                                      varlist)
            self._convert_dates = convert_dates
            for key in convert_dates:
                descr[key] = (
                        descr[key][0],
                        _convert_datetime_to_stata_type(convert_dates[key])
                                )
            dtype = np.dtype(descr)

        self.varlist = varlist
        self.typlist = [_dtype_to_stata_type(dtype[i])
                        for i in range(self.nvar)]
        self.fmtlist = [_dtype_to_default_stata_fmt(dtype[i])
                        for i in range(self.nvar)]
        # set the given format for the datetime cols
        if convert_dates is not None:
            for key in convert_dates:
                self.fmtlist[key] = convert_dates[key]
Example #15
    def get_columns(self, *args, **kw):
        """
        Calling function for factor instance.
        """

        v = self.namespace[self._name]
        while True:
            if callable(v):
                if isinstance(v, (Term, Formula)):
                    v = copy.copy(v)
                    v.namespace = self.namespace
                v = v(*args, **kw)
            else: break

        n = len(v)

        if self.ordinal:
            col = [float(self.keys.index(v[i])) for i in range(n)]
            return np.array(col)

        else:
            value = []
            for key in self.keys:
                col = [float((v[i] == key)) for i in range(n)]
                value.append(col)
            return np.array(value)
Example #16
    def _omega_forc_cov(self, steps):
        # Approximate MSE matrix \Omega(h) as defined in Lutkepohl, p. 97
        G = self._zz
        Ginv = L.inv(G)

        # memoize powers of B for speedup
        # TODO: see if can memoize better
        B = self._bmat_forc_cov()
        _B = {}
        def bpow(i):
            if i not in _B:
                _B[i] = np.linalg.matrix_power(B, i)

            return _B[i]

        phis = self.ma_rep(steps)
        sig_u = self.sigma_u

        omegas = np.zeros((steps, self.neqs, self.neqs))
        for h in range(1, steps + 1):
            if h == 1:
                omegas[h-1] = self.df_model * self.sigma_u
                continue

            om = omegas[h-1]
            for i in range(h):
                for j in range(h):
                    Bi = bpow(h - 1 - i)
                    Bj = bpow(h - 1 - j)
                    mult = np.trace(chain_dot(Bi.T, Ginv, Bj, G))
                    om += mult * chain_dot(phis[i], sig_u, phis[j].T)
            omegas[h-1] = om

        return omegas
Example #17
def _band2array(a, lower=0, symmetric=False, hermitian=False):
    """
    Take an upper or lower triangular banded matrix and return a
    numpy array.

    INPUTS:
       a         -- a matrix in upper or lower triangular banded form
       lower     -- if True, `a` is lower triangular banded; otherwise upper
       symmetric -- if True, return the original result plus its transpose
       hermitian -- if True (and symmetric False), return the original
                    result plus its conjugate transpose

    """

    n = a.shape[1]
    r = a.shape[0]
    _a = 0

    if not lower:
        for j in range(r):
            _b = np.diag(a[r-1-j],k=j)[j:(n+j),j:(n+j)]
            _a += _b
            if symmetric and j > 0: _a += _b.T
            elif hermitian and j > 0: _a += _b.conjugate().T
    else:
        for j in range(r):
            _b = np.diag(a[j],k=j)[0:n,0:n]
            _a += _b
            if symmetric and j > 0: _a += _b.T
            elif hermitian and j > 0: _a += _b.conjugate().T
        _a = _a.T

    return _a
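A small sketch calling the function above with a symmetric tridiagonal matrix stored in lower banded form (row 0 holds the main diagonal, row 1 the first subdiagonal, trailing entries unused):

import numpy as np

ab = np.array([[4., 4., 4., 4.],
               [1., 1., 1., 0.]])
print(_band2array(ab, lower=True, symmetric=True))
# [[4. 1. 0. 0.]
#  [1. 4. 1. 0.]
#  [0. 1. 4. 1.]
#  [0. 0. 1. 4.]]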
Example #18
                    def product_func(value, d1=d1, d2=d2):

                        out = []
                        for r in range(d1):
                            for s in range(d2):
                                out.append(value[r] * value[d1+s])
                        return np.array(out)
Example #19
def generate_ordinal():

    ## Regression coefficients
    beta = np.zeros(5, dtype=np.float64)
    beta[2] = 1
    beta[4] = -1

    rz = 0.5

    OUT = open("gee_ordinal_1.csv", "w")

    for i in range(200):

        n = np.random.randint(3, 6) # Cluster size

        x = np.random.normal(size=(n,5))
        for j in range(5):
            x[:,j] += np.random.normal()
        pr = np.dot(x, beta)
        pr = np.array([1,0,-0.5]) + pr[:,None]
        pr = 1 / (1 + np.exp(-pr))

        z = rz*np.random.normal() +\
            np.sqrt(1-rz**2)*np.random.normal(size=n)
        u = norm.cdf(z)

        y = (u[:,None] > pr).sum(1)

        for j in range(n):
            OUT.write("%d,%d," % (i, y[j]))
            OUT.write(",".join(["%.3f" % b for b in x[j,:]]) + "\n")

    OUT.close()
Example #20
def generate_poisson():

    ## Regression coefficients
    beta = np.zeros(5, dtype=np.float64)
    beta[2] = 0.5
    beta[4] = -0.5

    nclust = 100

    rz = 0.5

    OUT = open("gee_poisson_1.csv", "w")

    for i in range(nclust):

        n = np.random.randint(3, 6) # Cluster size

        x = np.random.normal(size=(n,5))
        for j in range(5):
            x[:,j] += np.random.normal()
        lp = np.dot(x, beta)
        E = np.exp(lp)
        y = [np.random.poisson(e) for e in E]
        y = np.array(y)

        for j in range(n):
            OUT.write("%d,%d," % (i, y[j]))
            OUT.write(",".join(["%.3f" % b for b in x[j,:]]) + "\n")

    OUT.close()
Example #21
def prob_mv_grid(bins, cdf, axis=-1):
    '''helper function for probability of a rectangle grid in a multivariate distribution

    how does this generalize to more than 2 variates ?

    bins : tuple
        tuple of bin edges, currently it is assumed that they broadcast
        correctly

    '''
    if not isinstance(bins, np.ndarray):
        bins = lmap(np.asarray, bins)
        n_dim = len(bins)
        bins_ = []
        #broadcast if binedges are 1d
        if all(lmap(np.ndim, bins) == np.ones(n_dim)):
            for d in range(n_dim):
                sl = [None]*n_dim
                sl[d] = slice(None)
                bins_.append(bins[d][tuple(sl)])
    else: #assume it is already correctly broadcasted
        n_dim = bins.shape[0]
        bins_ = bins

    print(len(bins))
    cdf_values = cdf(bins_)
    probs = cdf_values.copy()
    for d in range(n_dim):
        probs = np.diff(probs, axis=d)

    return probs
Example #22
    def _compute_J(self, A_solve, B_solve):

        #first compute appropriate duplication matrix
        # taken from Magnus and Neudecker (1980),
        #"The Elimination Matrix: Some Lemmas and Applications
        # the creation of the D_n matrix follows MN (1980) directly,
        #while the rest follows Hamilton (1994)

        neqs = self.neqs
        sigma_u = self.sigma_u
        A_mask = self.A_mask
        B_mask = self.B_mask

        #first generate duplication matrix, see MN (1980) for notation

        D_nT = np.zeros([int((1.0 / 2) * (neqs) * (neqs + 1)), neqs**2])

        for j in range(neqs):
            i=j
            while j <= i < neqs:
                u=np.zeros([int((1.0/2)*neqs*(neqs+1)), 1])
                u[int(j * neqs + (i + 1) - (1.0 / 2) * (j + 1) * j - 1)] = 1
                Tij=np.zeros([neqs,neqs])
                Tij[i,j]=1
                Tij[j,i]=1
                D_nT=D_nT+np.dot(u,(Tij.ravel('F')[:,None]).T)
                i=i+1

        D_n=D_nT.T
        D_pl=npl.pinv(D_n)

        #generate S_B
        S_B = np.zeros((neqs**2, len(A_solve[A_mask])))
        S_D = np.zeros((neqs**2, len(B_solve[B_mask])))

        j = 0
        j_d = 0
        if len(A_solve[A_mask]) != 0:
            A_vec = np.ravel(A_mask, order='F')
            for k in range(neqs**2):
                if A_vec[k] == True:
                    S_B[k,j] = -1
                    j += 1
        if len(B_solve[B_mask]) != 0:
            B_vec = np.ravel(B_mask, order='F')
            for k in range(neqs**2):
                if B_vec[k] == True:
                    S_D[k,j_d] = 1
                    j_d +=1

        #now compute J
        invA = npl.inv(A_solve)
        J_p1i = np.dot(np.dot(D_pl, np.kron(sigma_u, invA)), S_B)
        J_p1 = -2.0 * J_p1i
        J_p2 = np.dot(np.dot(D_pl, np.kron(invA, invA)), S_D)

        J = np.append(J_p1, J_p2, axis=1)

        return J
Example #23
    def test_zero_constrained(self):
        # not completely generic yet
        if (isinstance(self.results.model, (sm.GEE))):
            # GEE does not subclass LikelihoodModel
            pytest.skip('GEE does not subclass LikelihoodModel')

        use_start_params = not isinstance(self.results.model,
                                          (sm.RLM, sm.OLS, sm.WLS))
        self.use_start_params = use_start_params  # attach for _get_constrained

        keep_index = list(range(self.results.model.exog.shape[1]))
        # index for params might include extra params
        keep_index_p = list(range(self.results.params.shape[0]))
        drop_index = [1]
        for i in drop_index:
            del keep_index[i]
            del keep_index_p[i]

        if use_start_params:
            res1 = self.results.model._fit_zeros(keep_index, maxiter=500,
                                        start_params=self.results.params)
        else:
            res1 = self.results.model._fit_zeros(keep_index, maxiter=500)

        res2 = self._get_constrained(keep_index, keep_index_p)

        assert_allclose(res1.params[keep_index_p], res2.params, rtol=1e-10,
                        atol=1e-10)
        assert_equal(res1.params[drop_index], 0)
        assert_allclose(res1.bse[keep_index_p], res2.bse, rtol=1e-10,
                        atol=1e-10)
        assert_equal(res1.bse[drop_index], 0)
        # OSX has many slight failures on this test
        tol = 1e-8 if PLATFORM_OSX else 1e-10
        assert_allclose(res1.tvalues[keep_index_p], res2.tvalues, rtol=tol,
                        atol=tol)
        assert_allclose(res1.pvalues[keep_index_p], res2.pvalues, rtol=tol,
                        atol=tol)

        if hasattr(res1, 'resid'):
            # discrete models, Logit don't have `resid` yet
            # atol discussion at gh-5158
            rtol = 1e-10
            atol = 1e-12
            if PLATFORM_OSX:
                # GH 5628
                rtol = 1e-8
                atol = 1e-10
            assert_allclose(res1.resid, res2.resid, rtol=rtol, atol=atol)

        ex = self.results.model.exog.mean(0)
        predicted1 = res1.predict(ex, **self.predict_kwds)
        predicted2 = res2.predict(ex[keep_index], **self.predict_kwds)
        assert_allclose(predicted1, predicted2, rtol=1e-10)

        ex = self.results.model.exog[:5]
        predicted1 = res1.predict(ex, **self.predict_kwds)
        predicted2 = res2.predict(ex[:, keep_index], **self.predict_kwds)
        assert_allclose(predicted1, predicted2, rtol=1e-10)
Example #24
    def err_band_sz1(self, orth=False, svar=False, repl=1000,
                     signif=0.05, seed=None, burn=100, component=None):
        """
        IRF Sims-Zha error band method 1. Assumes symmetric error bands around
        mean.

        Parameters
        ----------
        orth : bool, default False
            Compute orthogonalized impulse responses
        repl : int, default 1000
            Number of MC replications
        signif : float (0 < signif < 1)
            Significance level for error bars, defaults to 95% CI
        seed : int, default None
            np.random seed
        burn : int, default 100
            Number of initial simulated obs to discard
        component : neqs x neqs array, default to largest for each
            Index of column of eigenvector/value to use for each error band
            Note: period of impulse (t=0) is not included when computing
            principal component

        References
        ----------
        Sims, Christopher A., and Tao Zha. 1999. "Error Bands for Impulse
        Response". Econometrica 67: 1113-1155.
        """

        model = self.model
        periods = self.periods
        irfs = self._choose_irfs(orth, svar)
        neqs = self.neqs
        irf_resim = model.irf_resim(orth=orth, repl=repl, T=periods, seed=seed,
                                    burn=burn)
        q = util.norm_signif_level(signif)

        W, eigva, k = self._eigval_decomp_SZ(irf_resim)

        if component is not None:
            if np.shape(component) != (neqs,neqs):
                raise ValueError("Component array must be " + str(neqs) + " x " + str(neqs))
            if np.argmax(component) >= neqs*periods:
                raise ValueError("Atleast one of the components does not exist")
            else:
                k = component

        # here take the kth column of W, which we determine by finding the largest eigenvalue of the covariance matrix
        lower = np.copy(irfs)
        upper = np.copy(irfs)
        for i in range(neqs):
            for j in range(neqs):
                lower[1:,i,j] = irfs[1:,i,j] + W[i,j,:,k[i,j]]*q*np.sqrt(eigva[i,j,k[i,j]])
                upper[1:,i,j] = irfs[1:,i,j] - W[i,j,:,k[i,j]]*q*np.sqrt(eigva[i,j,k[i,j]])

        return lower, upper
Example #25
    def select_order(self, maxlag, ic, trend='c', method='mle'):
        """
        Select the lag order according to the information criterion.

        Parameters
        ----------
        maxlag : int
            The highest lag length tried. See `AR.fit`.
        ic : str {'aic','bic','hqic','t-stat'}
            Criterion used for selecting the optimal lag length.
            See `AR.fit`.
        trend : str {'c','nc'}
            Whether to include a constant or not. 'c' - include constant.
            'nc' - no constant.

        Returns
        -------
        bestlag : int
            Best lag according to IC.
        """
        endog = self.endog

        # make Y and X with same nobs to compare ICs
        Y = endog[maxlag:]
        self.Y = Y  # attach to get correct fit stats
        X = self._stackX(maxlag, trend)  # sets k_trend
        self.X = X
        k = self.k_trend  # k_trend set in _stackX
        k = max(1, k)  # handle if startlag is 0
        results = {}

        if ic != 't-stat':
            for lag in range(k, maxlag+1):
                # have to reinstantiate the model to keep comparable models
                endog_tmp = endog[maxlag-lag:]
                fit = AR(endog_tmp).fit(maxlag=lag, method=method,
                                        full_output=0, trend=trend,
                                        maxiter=100, disp=0)
                results[lag] = getattr(fit, ic)
            bestic, bestlag = min((res, k) for k, res in iteritems(results))

        else:  # choose by last t-stat.
            stop = 1.6448536269514722  # for t-stat, norm.ppf(.95)
            for lag in range(maxlag, k - 1, -1):
                # have to reinstantiate the model to keep comparable models
                endog_tmp = endog[maxlag - lag:]
                fit = AR(endog_tmp).fit(maxlag=lag, method=method,
                                        full_output=0, trend=trend,
                                        maxiter=35, disp=-1)

                bestlag = 0
                if np.abs(fit.tvalues[-1]) >= stop:
                    bestlag = lag
                    break
        return bestlag
Example #26
def interactions(terms, order=[1,2]):
    """
    Output all pairwise interactions of given order of a
    sequence of terms.

    The argument order is a sequence specifying which order
    of interactions should be generated -- the default
    creates main effects and two-way interactions. If order
    is an integer, it is changed to range(1,order+1), so
    order=3 is equivalent to order=[1,2,3], generating
    all one, two and three-way interactions.

    If any entry of order is greater than len(terms), it is
    effectively treated as len(terms).

    >>> print(interactions([Term(l) for l in ['a', 'b', 'c']]))
    <formula: a*b + a*c + b*c + a + b + c>
    >>>
    >>> print(interactions([Term(l) for l in ['a', 'b', 'c']], order=list(range(5))))
    <formula: a*b + a*b*c + a*c + b*c + a + b + c>
    >>>

    """
    l = len(terms)

    values = {}

    if np.asarray(order).shape == ():
        order = lrange(1, int(order)+1)

    # First order

    for o in order:
        I = np.indices((l,)*(o))
        I.shape = (I.shape[0], np.prod(I.shape[1:]))
        for m in range(I.shape[1]):

            # only keep combinations that have unique entries

            if (np.unique(I[:,m]).shape == I[:,m].shape and
                    np.all(np.equal(np.sort(I[:,m]), I[:,m]))):
                ll = [terms[j] for j in I[:,m]]
                v = ll[0]
                for ii in range(len(ll)-1):
                    v *= ll[ii+1]
                values[tuple(I[:,m])] = v

    key = list(iterkeys(values))[0]
    value = values[key]
    del(values[key])

    for v in itervalues(values):
        value += v
    return value
Example #27
    def _prepare_ndarray(self, data):
        if data.ndim == 1:
            data = data[:, None]
        self.nobs, self.nvar = data.shape
        self.data = data
        self.datarows = iter(data)
        # TODO: this should be user settable
        dtype = data.dtype
        self.varlist = _default_names(self.nvar)
        self.typlist = [_dtype_to_stata_type(dtype) for i in range(self.nvar)]
        self.fmtlist = [_dtype_to_default_stata_fmt(dtype)
                        for i in range(self.nvar)]
Example #28
def _make_exog_names(exog):
    exog_var = exog.var(0)
    if (exog_var == 0).any():
        # assumes one constant in first or last position
        # avoid exception if more than one constant
        const_idx = exog_var.argmin()
        exog_names = ['x%d' % i for i in range(1, exog.shape[1])]
        exog_names.insert(const_idx, 'const')
    else:
        exog_names = ['x%d' % i for i in range(1, exog.shape[1]+1)]

    return exog_names
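A small sketch of the naming convention implemented above (the data are arbitrary): the constant column becomes 'const' and the remaining regressors are numbered x1, x2, ... in order:

import numpy as np

exog = np.column_stack([np.ones(10), np.random.randn(10), np.random.randn(10)])
print(_make_exog_names(exog))   # ['const', 'x1', 'x2']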
Example #29
def approx_fprime(x, f, epsilon=None, args=(), kwargs={}, centered=False):
    '''
    Gradient of function, or Jacobian if function f returns 1d array

    Parameters
    ----------
    x : array
        parameters at which the derivative is evaluated
    f : function
        `f(*((x,)+args), **kwargs)` returning either one value or 1d array
    epsilon : float, optional
        Stepsize, if None, optimal stepsize is used. This is EPS**(1/2)*x for
        `centered` == False and EPS**(1/3)*x for `centered` == True.
    args : tuple
        Tuple of additional arguments for function `f`.
    kwargs : dict
        Dictionary of additional keyword arguments for function `f`.
    centered : bool
        Whether central difference should be returned. If not, does forward
        differencing.

    Returns
    -------
    grad : array
        gradient or Jacobian

    Notes
    -----
    If f returns a 1d array, it returns a Jacobian. If a 2d array is returned
    by f (e.g., with a value for each observation), it returns a 3d array
    with the Jacobian of each observation with shape xk x nobs x xk. I.e.,
    the Jacobian of the first observation would be [:, 0, :]
    '''
    n = len(x)
    # TODO:  add scaled stepsize
    f0 = f(*((x,)+args), **kwargs)
    dim = np.atleast_1d(f0).shape  # it could be a scalar
    grad = np.zeros((n,) + dim, np.promote_types(float, x.dtype))
    ei = np.zeros((n,), float)
    if not centered:
        epsilon = _get_epsilon(x, 2, epsilon, n)
        for k in range(n):
            ei[k] = epsilon[k]
            grad[k, :] = (f(*((x+ei,) + args), **kwargs) - f0)/epsilon[k]
            ei[k] = 0.0
    else:
        epsilon = _get_epsilon(x, 3, epsilon, n) / 2.
        for k in range(len(x)):
            ei[k] = epsilon[k]
            grad[k, :] = (f(*((x+ei,)+args), **kwargs) -
                          f(*((x-ei,)+args), **kwargs))/(2 * epsilon[k])
            ei[k] = 0.0
    return grad.squeeze().T
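A quick check of forward versus centered differences for a function whose gradient is known analytically; the import assumes the public location statsmodels.tools.numdiff:

import numpy as np
from statsmodels.tools.numdiff import approx_fprime

f = lambda x: x[0] ** 2 + 3.0 * x[1]        # gradient is [2*x0, 3]
x0 = np.array([1.0, 2.0])
print(approx_fprime(x0, f))                 # roughly [2., 3.]
print(approx_fprime(x0, f, centered=True))  # typically closer to [2., 3.]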
Example #30
    def gram(self, d=0):
        """
        Compute Gram inner product matrix, storing it in lower
        triangular banded form.

        The (i,j) entry is

        G_ij = integral b_i^(d) b_j^(d)

        where b_i are the basis elements of the BSpline and (d) is the
        d-th derivative.

        If d is a matrix, then it is assumed to specify a differential
        operator as follows: the first row represents the order of derivative
        with the second row the coefficient corresponding to that order.

        For instance:

        [[2, 3],
         [3, 1]]

        represents 3 * f^(2) + 1 * f^(3).

        INPUTS:
           d    -- which derivative to apply to each basis element,
                   if d is a matrix, it is assumed to specify
                   a differential operator as above

        OUTPUTS: gram
           gram -- the matrix of inner products of (derivatives)
                   of the BSpline elements

        """

        d = np.squeeze(d)
        if np.asarray(d).shape == ():
            self.g = _hbspline.gram(self.tau, self.m, int(d), int(d))
        else:
            d = np.asarray(d)
            if d.shape[0] != 2:
                raise ValueError("if d is not an integer, expecting a jx2 \
                   array with first row indicating order \
                   of derivative, second row coefficient in front.")
            if d.shape == (2,):
                d.shape = (2,1)
            self.g = 0
            for i in range(d.shape[1]):
                for j in range(d.shape[1]):
                    self.g += d[1,i]* d[1,j] * _hbspline.gram(self.tau, self.m, int(d[0,i]), int(d[0,j]))
        self.g = self.g.T
        self.d = d
        return np.nan_to_num(self.g)
Example #31
def summary_params_2dflat(result, endog_names=None, exog_names=None, alpha=0.05,
                          use_t=True, keep_headers=True, endog_cols=False): #skip_headers2=True):
    '''summary table for parameters that are 2d, e.g. multi-equation models

    Parameters
    ----------
    result : result instance
        the result instance with params, bse, tvalues and conf_int
    endog_names : None or list of strings
        names for rows of the parameter array (multivariate endog)
    exog_names : None or list of strings
        names for columns of the parameter array (exog)
    alpha : float
        significance level for the confidence intervals; the default
        alpha = 0.05 corresponds to 95% intervals
    use_t : bool
        indicator whether the p-values are based on the Student-t
        distribution (if True) or on the normal distribution (if False)
    keep_headers : bool
        If true (default), then sub-tables keep their headers. If false, then
        only the first headers are kept, the other headers are blanked out
    endog_cols : bool
        If false (default) then params and other result statistics have
        equations by rows. If true, then equations are assumed to be in columns.
        Not implemented yet.

    Returns
    -------
    tables : list of SimpleTable
        this contains a list of all separate Subtables
    table_all : SimpleTable
        the merged table with results concatenated for each row of the parameter
        array

    '''

    res = result
    params = res.params
    if params.ndim == 2: # we've got multiple equations
        n_equ = params.shape[1]
        if not len(endog_names) == params.shape[1]:
            raise ValueError('endog_names has wrong length')
    else:
        if not len(endog_names) == len(params):
            raise ValueError('endog_names has wrong length')
        n_equ = 1

    #VAR doesn't have conf_int
    #params = res.params.T # this is a convention for multi-eq models

    if not isinstance(endog_names, list):
        #this might be specific to multinomial logit type, move?
        if endog_names is None:
            endog_basename = 'endog'
        else:
            endog_basename = endog_names
        #TODO: note, the [1:] is specific to current MNLogit
        endog_names = res.model.endog_names[1:]

    #check if we have the right length of names

    tables = []
    for eq in range(n_equ):
        restup = (res, res.params[:,eq], res.bse[:,eq], res.tvalues[:,eq],
                  res.pvalues[:,eq], res.conf_int(alpha)[eq])

        #not used anymore in current version
#        if skip_headers2:
#            skiph = (row != 0)
#        else:
#            skiph = False
        skiph = False
        tble = summary_params(restup, yname=endog_names[eq],
                              xname=exog_names, alpha=alpha, use_t=use_t,
                              skip_header=skiph)

        tables.append(tble)

    #add titles, they will be moved to header lines in table_extend
    for i in range(len(endog_names)):
        tables[i].title = endog_names[i]

    table_all = table_extend(tables, keep_headers=keep_headers)

    return tables, table_all
Example #32
    def test_zero_constrained(self):
        # not completely generic yet
        if (isinstance(self.results.model, (sm.GEE))):
            # GEE does not subclass LikelihoodModel
            pytest.skip('GEE does not subclass LikelihoodModel')

        use_start_params = not isinstance(self.results.model,
                                          (sm.RLM, sm.OLS, sm.WLS))
        self.use_start_params = use_start_params  # attach for _get_constrained

        keep_index = list(range(self.results.model.exog.shape[1]))
        # index for params might include extra params
        keep_index_p = list(range(self.results.params.shape[0]))
        drop_index = [1]
        for i in drop_index:
            del keep_index[i]
            del keep_index_p[i]

        if use_start_params:
            res1 = self.results.model._fit_zeros(
                keep_index, maxiter=500, start_params=self.results.params)
        else:
            res1 = self.results.model._fit_zeros(keep_index, maxiter=500)

        res2 = self._get_constrained(keep_index, keep_index_p)

        assert_allclose(res1.params[keep_index_p],
                        res2.params,
                        rtol=1e-10,
                        atol=1e-10)
        assert_equal(res1.params[drop_index], 0)
        assert_allclose(res1.bse[keep_index_p],
                        res2.bse,
                        rtol=1e-10,
                        atol=1e-10)
        assert_equal(res1.bse[drop_index], 0)
        assert_allclose(res1.tvalues[keep_index_p],
                        res2.tvalues,
                        rtol=1e-10,
                        atol=1e-10)
        assert_allclose(res1.pvalues[keep_index_p],
                        res2.pvalues,
                        rtol=1e-10,
                        atol=1e-10)

        if hasattr(res1, 'resid'):
            # discrete models, Logit don't have `resid` yet
            # atol discussion at gh-5158
            rtol = 1e-10
            atol = 1e-12
            if PLATFORM_OSX:
                # GH 5628
                rtol = 1e-8
                atol = 1e-10
            assert_allclose(res1.resid, res2.resid, rtol=rtol, atol=atol)

        ex = self.results.model.exog.mean(0)
        predicted1 = res1.predict(ex, **self.predict_kwds)
        predicted2 = res2.predict(ex[keep_index], **self.predict_kwds)
        assert_allclose(predicted1, predicted2, rtol=1e-10)

        ex = self.results.model.exog[:5]
        predicted1 = res1.predict(ex, **self.predict_kwds)
        predicted2 = res2.predict(ex[:, keep_index], **self.predict_kwds)
        assert_allclose(predicted1, predicted2, rtol=1e-10)
Example #33
def add_lag(x, col=None, lags=1, drop=False, insert=True):
    """
    Returns an array with lags included given an array.

    Parameters
    ----------
    x : array
        An array or NumPy ndarray subclass. Can be either a 1d or 2d array with
        observations in columns.
    col : 'string', int, or None
        If data is a structured array or a recarray, `col` can be a string
        that is the name of the column containing the variable. Or `col` can
        be an int of the zero-based column index. If it's a 1d array `col`
        can be None.
    lags : int
        The number of lags desired.
    drop : bool
        Whether to keep the contemporaneous variable for the data.
    insert : bool or int
        If True, inserts the lagged values after `col`. If False, appends
        the data. If int inserts the lags at int.

    Returns
    -------
    array : ndarray
        Array with lags

    Examples
    --------

    >>> import statsmodels.api as sm
    >>> data = sm.datasets.macrodata.load(as_pandas=False)
    >>> data = data.data[['year','quarter','realgdp','cpi']]
    >>> data = sm.tsa.add_lag(data, 'realgdp', lags=2)

    Notes
    -----
    Trims the array both forward and backward, so that the length of the
    returned array is len(`x`) - lags. The lags are returned in increasing
    order, i.e., t-1, t-2, ..., t-lags.
    """
    if x.dtype.names:
        names = x.dtype.names
        if not col and np.squeeze(x).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        elif len(names) == 1:
            col = names[0]
        if isinstance(col, (int, long)):
            col = x.dtype.names[col]
        if not PY3:
            # TODO: Get rid of this kludge.  See GH # 3658
            names = [bytes(name)
                     if isinstance(name, unicode)  # noqa:F821
                     else name for name in names]
            # Fail loudly if there is a non-ascii name.
            x.dtype.names = names
            if isinstance(col, unicode):  # noqa:F821
                col = bytes(col)

        contemp = x[col]

        # make names for lags
        tmp_names = [col + '_'+'L(%i)' % i for i in range(1, lags+1)]
        ndlags = lagmat(contemp, maxlag=lags, trim='Both')

        # get index for return
        if insert is True:
            ins_idx = list(names).index(col) + 1
        elif insert is False:
            ins_idx = len(names) + 1
        else: # insert is an int
            if insert > len(names):
                import warnings
                warnings.warn("insert > number of variables, inserting at the"
                              " last position", ValueWarning)
            ins_idx = insert

        first_names = list(names[:ins_idx])
        last_names = list(names[ins_idx:])

        if drop:
            if col in first_names:
                first_names.pop(first_names.index(col))
            else:
                last_names.pop(last_names.index(col))

        if first_names: # only do this if x isn't "empty"
            # Workaround to avoid NumPy FutureWarning
            _x = recarray_select(x, first_names)
            first_arr = nprf.append_fields(_x[lags:], tmp_names, ndlags.T,
                                           usemask=False)

        else:
            first_arr = np.zeros(len(x)-lags, dtype=lzip(tmp_names,
                (x[col].dtype,)*lags))
            for i,name in enumerate(tmp_names):
                first_arr[name] = ndlags[:,i]
        if last_names:
            return nprf.append_fields(first_arr, last_names,
                    [x[name][lags:] for name in last_names], usemask=False)
        else: # lags for last variable
            return first_arr

    else: # we have an ndarray

        if x.ndim == 1: # make 2d if 1d
            x = x[:,None]
        if col is None:
            col = 0

        # handle negative index
        if col < 0:
            col = x.shape[1] + col

        contemp = x[:,col]

        if insert is True:
            ins_idx = col + 1
        elif insert is False:
            ins_idx = x.shape[1]
        else:
            if insert < 0: # handle negative index
                insert = x.shape[1] + insert + 1
            if insert > x.shape[1]:
                insert = x.shape[1]
                import warnings
                warnings.warn("insert > number of variables, inserting at the"
                              " last position", ValueWarning)
            ins_idx = insert

        ndlags = lagmat(contemp, lags, trim='Both')
        first_cols = lrange(ins_idx)
        last_cols = lrange(ins_idx,x.shape[1])
        if drop:
            if col in first_cols:
                first_cols.pop(first_cols.index(col))
            else:
                last_cols.pop(last_cols.index(col))
        return np.column_stack((x[lags:,first_cols],ndlags,
                    x[lags:,last_cols]))
Example #34
def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim='forward',
              use_pandas=False):
    """
    Generate lagmatrix for 2d array, columns arranged by variables

    Parameters
    ----------
    x : array_like, 2d
        2d data, observation in rows and variables in columns
    maxlag0 : int
        for the first variable, all lags from zero to maxlag0 are included
    maxlagex : None or int
        maximum lag for all other variables; all lags from zero to maxlagex
        are included
    dropex : int (default is 0)
        exclude the first dropex lags from the other variables; for all
        variables except the first, lags from dropex to maxlagex are included
    trim : string
        * 'forward' : trim invalid observations in front
        * 'backward' : trim invalid initial observations
        * 'both' : trim invalid observations on both sides
        * 'none' : no trimming of observations
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        array with lagged observations, columns ordered by variable

    Notes
    -----
    Inefficient implementation for unequal lags, implemented for convenience
    """

    if maxlagex is None:
        maxlagex = maxlag0
    maxlag = max(maxlag0, maxlagex)
    is_pandas = _is_using_pandas(x, None)

    if x.ndim == 1:
        if is_pandas:
            x = pd.DataFrame(x)
        else:
            x = x[:, None]
    elif x.ndim == 0 or x.ndim > 2:
        raise ValueError('Only supports 1 and 2-dimensional data.')

    nobs, nvar = x.shape

    if is_pandas and use_pandas:
        lags = lagmat(x.iloc[:, 0], maxlag, trim=trim,
                      original='in', use_pandas=True)
        lagsli = [lags.iloc[:, :maxlag0 + 1]]
        for k in range(1, nvar):
            lags = lagmat(x.iloc[:, k], maxlag, trim=trim,
                          original='in', use_pandas=True)
            lagsli.append(lags.iloc[:, dropex:maxlagex + 1])
        return pd.concat(lagsli, axis=1)
    elif is_pandas:
        x = np.asanyarray(x)

    lagsli = [lagmat(x[:, 0], maxlag, trim=trim, original='in')[:, :maxlag0 + 1]]
    for k in range(1, nvar):
        lagsli.append(lagmat(x[:, k], maxlag, trim=trim, original='in')[:, dropex:maxlagex + 1])
    return np.column_stack(lagsli)
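A minimal usage sketch, assuming lagmat2ds is importable from its public location statsmodels.tsa.tsatools; the first column keeps lags 0..maxlag0 and the other column keeps lags dropex..maxlagex:

import numpy as np
from statsmodels.tsa.tsatools import lagmat2ds

x = np.column_stack([np.arange(10.), 10 * np.arange(10.)])
out = lagmat2ds(x, maxlag0=2)
print(out.shape)   # (10, 6): [y_t, y_{t-1}, y_{t-2}, x_t, x_{t-1}, x_{t-2}]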
Example #35
thread_pool_num = int(sys.argv[3])
alpha = float(sys.argv[4])

# data_input = spark.read.csv(data_file_name, header=True, inferSchema=True).cache()
data_input = spark.read.csv(data_file_name, header=True, inferSchema=True).persist(
    pyspark.StorageLevel.MEMORY_AND_DISK_2)

k = len(data_input.columns)
data_input = data_input.withColumn("id", monotonically_increasing_id())

# add time lag for all x_name columns
w = Window().orderBy(col("id"))
x_list = data_input.columns
x_list.remove("id")
for x_name_item in x_list:
    for i in range(1, maxlag + 1):
        data_input = data_input.withColumn("%s_t-%s" % (x_name_item, str(i)),
                                           lag(data_input[x_name_item], i, 0).over(w))

data_input.cache()
# data_input.show(5)
# spark.stop()
n = data_input.count()


# print(x_list)
# maxlag = 3


def regression(x_name, y_name, maxlag, data=data_input):
    print("!!!!!!!!!start regression!!!!!!!!!")
Example #36
def test_lutkepohl_parse():
    files = ['e%d' % i for i in range(1, 7)]

    for f in files:
        get_lutkepohl_data(f)
Example #37
    def test_irf_stderr(self):
        irf_stderr = self.irf.stderr(orth=False)
        for i in range(1, 1 + len(self.lut.irf_stderr)):
            assert_almost_equal(np.round(irf_stderr[i], 3),
                                self.lut.irf_stderr[i-1])
Example #38
    def update(self, params):

        if self.model.weights is not None:
            warnings.warn(
                "weights not implemented for autoregressive "
                "cov_struct, using unweighted covariance estimate",
                NotImplementedWarning)

        endog = self.model.endog_li
        time = self.model.time_li

        # Only need to compute this once
        if self.designx is not None:
            designx = self.designx
        else:
            designx = []
            for i in range(self.model.num_group):

                ngrp = len(endog[i])
                if ngrp == 0:
                    continue

                # Loop over pairs of observations within a cluster
                for j1 in range(ngrp):
                    for j2 in range(j1):
                        designx.append(
                            self.dist_func(time[i][j1, :], time[i][j2, :]))

            designx = np.array(designx)
            self.designx = designx

        scale = self.model.estimate_scale()
        varfunc = self.model.family.variance
        cached_means = self.model.cached_means

        # Weights
        var = 1. - self.dep_params**(2 * designx)
        var /= 1. - self.dep_params**2
        wts = 1. / var
        wts /= wts.sum()

        residmat = []
        for i in range(self.model.num_group):

            expval, _ = cached_means[i]
            stdev = np.sqrt(scale * varfunc(expval))
            resid = (endog[i] - expval) / stdev

            ngrp = len(resid)
            for j1 in range(ngrp):
                for j2 in range(j1):
                    residmat.append([resid[j1], resid[j2]])

        residmat = np.array(residmat)

        # Need to minimize this
        def fitfunc(a):
            dif = residmat[:, 0] - (a**designx) * residmat[:, 1]
            return np.dot(dif**2, wts)

        # Left bracket point
        b_lft, f_lft = 0., fitfunc(0.)

        # Center bracket point
        b_ctr, f_ctr = 0.5, fitfunc(0.5)
        while f_ctr > f_lft:
            b_ctr /= 2
            f_ctr = fitfunc(b_ctr)
            if b_ctr < 1e-8:
                self.dep_params = 0
                return

        # Right bracket point
        b_rgt, f_rgt = 0.75, fitfunc(0.75)
        while f_rgt < f_ctr:
            b_rgt = b_rgt + (1. - b_rgt) / 2
            f_rgt = fitfunc(b_rgt)
            if b_rgt > 1. - 1e-6:
                raise ValueError(
                    "Autoregressive: unable to find right bracket")

        from scipy.optimize import brent
        self.dep_params = brent(fitfunc, brack=[b_lft, b_ctr, b_rgt])
Example #39
    def covariance_matrix_solve(self, expval, index, stdev, rhs):
        """
        Solves matrix equations of the form `covmat * soln = rhs` and
        returns the values of `soln`, where `covmat` is the covariance
        matrix represented by this class.

        Parameters
        ----------
        expval : array-like
            The expected value of endog for each observed value in the
            group.
        index : integer
            The group index.
        stdev : array-like
            The standard deviation of endog for each observation in
            the group.
        rhs : list/tuple of array-like
            A set of right-hand sides; each defines a matrix equation
            to be solved.

        Returns
        -------
        soln : list/tuple of array-like
            The solutions to the matrix equations.

        Notes
        -----
        Returns None if the solver fails.

        Some dependence structures do not use `expval` and/or `index`
        to determine the correlation matrix.  Some families
        (e.g. binomial) do not use the `stdev` parameter when forming
        the covariance matrix.

        If the covariance matrix is singular or not SPD, it is
        projected to the nearest such matrix.  These projection events
        are recorded in the fit_history member of the GEE model.

        Systems of linear equations with the covariance matrix as the
        left hand side (LHS) are solved for different right hand sides
        (RHS); the LHS is only factorized once to save time.

        This is a default implementation, it can be reimplemented in
        subclasses to optimize the linear algebra according to the
        structure of the covariance matrix.
        """

        vmat, is_cor = self.covariance_matrix(expval, index)
        if is_cor:
            vmat *= np.outer(stdev, stdev)

        # Factor the covariance matrix.  If the factorization fails,
        # attempt to condition it into a factorizable matrix.
        threshold = 1e-2
        success = False
        cov_adjust = 0
        for itr in range(20):
            try:
                vco = spl.cho_factor(vmat)
                success = True
                break
            except np.linalg.LinAlgError:
                vmat = cov_nearest(vmat,
                                   method=self.cov_nearest_method,
                                   threshold=threshold)
                threshold *= 2
                cov_adjust += 1

        self.cov_adjust.append(cov_adjust)

        # Last resort if we still can't factor the covariance matrix.
        if not success:
            warnings.warn(
                "Unable to condition covariance matrix to an SPD "
                "matrix using cov_nearest", ConvergenceWarning)
            vmat = np.diag(np.diag(vmat))
            vco = spl.cho_factor(vmat)

        soln = [spl.cho_solve(vco, x) for x in rhs]
        return soln
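A sketch of the factor-once, solve-many pattern described in the Notes above, using scipy's Cholesky routines: the covariance matrix is factorized a single time and the factor is reused for every right-hand side:

import numpy as np
from scipy.linalg import cho_factor, cho_solve

vmat = np.array([[2.0, 0.5],
                 [0.5, 1.0]])
rhs = [np.array([1.0, 0.0]), np.eye(2)]
vco = cho_factor(vmat)                      # factorize once
soln = [cho_solve(vco, x) for x in rhs]     # reuse for each right-hand side
print(np.allclose(vmat @ soln[0], rhs[0]))  # True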
Example #40
def lowess(endog, exog, frac=2. / 3, it=3):
    """
    LOWESS (Locally Weighted Scatterplot Smoothing)

    A lowess function that outputs smoothed estimates of endog
    at the given exog values from points (exog, endog)

    Parameters
    ----------
    endog: 1-D numpy array
        The y-values of the observed points
    exog: 1-D numpy array
        The x-values of the observed points
    frac: float
        Between 0 and 1. The fraction of the data used
        when estimating each y-value.
    it: int
        The number of residual-based reweightings
        to perform.

    Returns
    -------
    out: numpy array
        A numpy array with two columns. The first column
        is the sorted x values and the second column the
        associated estimated y-values.

    Notes
    -----
    This lowess function implements the algorithm given in the
    reference below using local linear estimates.

    Suppose the input data has N points. The algorithm works by
    estimating the true ``y_i`` by taking the frac*N closest points
    to ``(x_i,y_i)`` based on their x values and estimating ``y_i``
    using a weighted linear regression. The weight for ``(x_j,y_j)``
    is the `_lowess_tricube` function applied to ``|x_i-x_j|``.

    If ``it > 0``, then further weighted local linear regressions
    are performed, where the weights are the same as above
    times the `_lowess_bisquare` function of the residuals. Each iteration
    takes approximately the same amount of time as the original fit,
    so these iterations are expensive. They are most useful when
    the noise has extremely heavy tails, such as Cauchy noise.
    Noise with less-heavy tails, such as t-distributions with ``df > 2``,
    is less problematic. The weights downgrade the influence of
    points with large residuals. In the extreme case, points whose
    residuals are larger than 6 times the median absolute residual
    are given weight 0.

    Some experimentation is likely required to find a good
    choice of ``frac`` and ``it`` for a particular dataset.

    References
    ----------
    Cleveland, W.S. (1979) "Robust Locally Weighted Regression
    and Smoothing Scatterplots". Journal of the American Statistical
    Association 74 (368): 829-836.

    Examples
    --------
    The example below compares how different the fits from `lowess`
    can be for different values of ``frac``.

    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> lowess = sm.nonparametric.lowess
    >>> x = np.random.uniform(low=-2*np.pi, high=2*np.pi, size=500)
    >>> y = np.sin(x) + np.random.normal(size=len(x))
    >>> z = lowess(y, x)
    >>> w = lowess(y, x, frac=1./3)

    This gives a similar comparison for when ``it`` is 0 versus the default
    number of robustifying iterations.

    >>> import scipy.stats as stats
    >>> x = np.random.uniform(low=-2*np.pi, high=2*np.pi, size=500)
    >>> y = np.sin(x) + stats.cauchy.rvs(size=len(x))
    >>> z = lowess(y, x, frac= 1./3, it=0)
    >>> w = lowess(y, x, frac=1./3)

    """
    x = exog

    if exog.ndim != 1:
        raise ValueError('exog must be a vector')
    if endog.ndim != 1:
        raise ValueError('endog must be a vector')
    if endog.shape[0] != x.shape[0]:
        raise ValueError('exog and endog must have same length')

    n = exog.shape[0]
    fitted = np.zeros(n)

    k = int(frac * n)

    index_array = np.argsort(exog)
    x_copy = np.array(exog[index_array])  #, dtype ='float32')
    y_copy = endog[index_array]

    fitted, weights = _lowess_initial_fit(x_copy, y_copy, k, n)

    for i in range(it):
        _lowess_robustify_fit(x_copy, y_copy, fitted, weights, k, n)

    out = np.array([x_copy, fitted]).T
    out.shape = (n, 2)

    return out
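The Notes above describe the core local-linear step. A minimal, unoptimized sketch of a single tricube-weighted local fit is shown below; this is illustrative only, not the `_lowess_initial_fit` implementation, and the helper name `naive_lowess_point` is made up:

import numpy as np

def naive_lowess_point(x, y, x0, frac=2. / 3):
    """Tricube-weighted local linear estimate of y at x0 (illustration)."""
    n = len(x)
    k = int(frac * n)
    # the k nearest neighbours of x0, measured on the x axis
    dist = np.abs(x - x0)
    idx = np.argsort(dist)[:k]
    d = dist[idx] / dist[idx].max()
    w = (1 - d ** 3) ** 3                    # tricube weights
    # weighted least squares for a local line a + b*x
    X = np.column_stack((np.ones(k), x[idx]))
    XtW = X.T * w
    a, b = np.linalg.solve(XtW @ X, XtW @ y[idx])
    return a + b * x0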
Example #41
0
def convolution_filter(x, filt, nsides=2):
    '''
    Linear filtering via convolution. Centered and backward displaced moving
    weighted average.

    Parameters
    ----------
    x : array_like
        data array, 1d or 2d, if 2d then observations in rows
    filt : array_like
        Linear filter coefficients in reverse time-order. Should have the
        same number of dimensions as x though if 1d and ``x`` is 2d will be
        coerced to 2d.
    nsides : int, optional
        If 2, a centered moving average is computed using the filter
        coefficients. If 1, the filter coefficients are for past values only.
        Both methods use scipy.signal.convolve.

    Returns
    -------
    y : ndarray, 2d
        Filtered array, number of columns determined by x and filt. If a
        pandas object is given, a pandas object is returned. The index of
        the return is the exact same as the time period in ``x``

    Notes
    -----
    If nsides == 1, x is filtered ::

        y[n] = filt[0]*x[n-1] + ... + filt[n_filt-1]*x[n-n_filt]

    where n_filt is len(filt).

    If nsides == 2, x is filtered around lag 0 ::

        y[n] = filt[0]*x[n - n_filt/2] + ... + filt[n_filt / 2] * x[n]
               + ... + filt[n_filt - 1] * x[n + n_filt/2]

    where n_filt is len(filt). If n_filt is even, then more of the filter
    is forward in time than backward.

    If filt is 1d or (nlags,1) one lag polynomial is applied to all
    variables (columns of x). If filt is 2d, (nlags, nvars) each series is
    independently filtered with its own lag polynomial, uses loop over nvar.
    This is different than the usual 2d vs 2d convolution.

    Filtering is done with scipy.signal.convolve, so it will be reasonably
    fast for medium sized data. For large data fft convolution would be
    faster.
    '''
    # for nsides shift the index instead of using 0 for 0 lag this
    # allows correct handling of NaNs
    if nsides == 1:
        trim_head = len(filt) - 1
        trim_tail = None
    elif nsides == 2:
        trim_head = int(np.ceil(len(filt) / 2.) - 1) or None
        trim_tail = int(np.ceil(len(filt) / 2.) - len(filt) % 2) or None
    else:  # pragma : no cover
        raise ValueError("nsides must be 1 or 2")

    pw = PandasWrapper(x)
    x = array_like(x, 'x', maxdim=2)
    filt = array_like(filt, 'filt', ndim=x.ndim)

    if filt.ndim == 1 or min(filt.shape) == 1:
        result = signal.convolve(x, filt, mode='valid')
    elif filt.ndim == 2:
        nlags = filt.shape[0]
        nvar = x.shape[1]
        result = np.zeros((x.shape[0] - nlags + 1, nvar))
        if nsides == 2:
            for i in range(nvar):
                # could also use np.convolve, but easier for switching to fft
                result[:, i] = signal.convolve(x[:, i],
                                               filt[:, i],
                                               mode='valid')
        elif nsides == 1:
            for i in range(nvar):
                result[:, i] = signal.convolve(x[:, i],
                                               np.r_[0, filt[:, i]],
                                               mode='valid')
    result = _pad_nans(result, trim_head, trim_tail)
    return pw.wrap(result)
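For instance, a three-term centered moving average (nsides=2) pads one NaN at each end where the window does not fully overlap the data; the import path below assumes the statsmodels filtertools module:

import numpy as np
from statsmodels.tsa.filters.filtertools import convolution_filter

x = np.arange(10.0)
# centered 3-term moving average; the first and last entries are NaN
ma3 = convolution_filter(x, np.ones(3) / 3, nsides=2)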
Example #42
0
def summary(self, yname=None, xname=None, title=0, alpha=.05,
            returns='text', model_info=None):
    """
    Parameters
    ----------
    yname : string
            optional, Default is `Y`
    xname : list of strings
            optional, Default is `X.#` where # ranges over the number of regressors
    Confidence interval : (0,1) not implemented
    title : string
            optional, Default is 'Generalized linear model'
    returns : string
              'text', 'table', 'csv', 'latex', 'html'

    Returns
    -------
    Default :
    returns='print'
            Prints the summarized results

    Option :
    returns='text'
            Prints the summarized results

    Option :
    returns='table'
             SimpleTable instance : summarizing the fit of a linear model.

    Option :
    returns='csv'
            returns a string of csv of the results, to import into a spreadsheet

    Option :
    returns='latex'
    Not implemented yet

    Option :
    returns='HTML'
    Not implemented yet


    Examples (needs updating)
    --------
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.longley.load(as_pandas=False)
    >>> data.exog = sm.add_constant(data.exog)
    >>> ols_results = sm.OLS(data.endog, data.exog).fit()
    >>> print(ols_results.summary())
    ...

    Notes
    -----
    conf_int calculated from normal dist.
    """
    import time as time



    #TODO Make sure all self.model.__class__.__name__ are listed
    model_types = {'OLS' : 'Ordinary least squares',
                   'GLS' : 'Generalized least squares',
                   'GLSAR' : 'Generalized least squares with AR(p)',
                   'WLS' : 'Weighted least squares',
                   'RLM' : 'Robust linear model',
                   'GLM' : 'Generalized linear model'
                   }
    model_methods = {'OLS' : 'Least Squares',
                   'GLS' : 'Least Squares',
                   'GLSAR' : 'Least Squares',
                   'WLS' : 'Least Squares',
                   'RLM' : '?',
                   'GLM' : '?'}
    if title==0:
        title = model_types[self.model.__class__.__name__]
    if yname is None:
        try:
            yname = self.model.endog_names
        except AttributeError:
            yname = 'y'
    if xname is None:
        try:
            xname = self.model.exog_names
        except AttributeError:
            xname = ['var_%d' % i for i in range(len(self.params))]
    time_now = time.localtime()
    time_of_day = [time.strftime("%H:%M:%S", time_now)]
    date = time.strftime("%a, %d %b %Y", time_now)
    modeltype = self.model.__class__.__name__
    #dist_family = self.model.family.__class__.__name__
    nobs = self.nobs
    df_model = self.df_model
    df_resid = self.df_resid

    #General part of the summary table, Applicable to all? models
    #------------------------------------------------------------
    #TODO: define this generically, overwrite in model classes
    #replace definition of stubs data by single list
    #e.g.
    gen_left =   [('Model type:', [modeltype]),
                  ('Date:', [date]),
                  ('Dependent Variable:', yname), #What happens with multiple names?
                  ('df model', [df_model])
                  ]
    gen_stubs_left, gen_data_left = zip_longest(*gen_left) #transpose row col

    gen_title = title
    gen_header = None
##    gen_stubs_left = ('Model type:',
##                      'Date:',
##                      'Dependent Variable:',
##                      'df model'
##                  )
##    gen_data_left = [[modeltype],
##                     [date],
##                     yname, #What happens with multiple names?
##                     [df_model]
##                     ]
    gen_table_left = SimpleTable(gen_data_left,
                                 gen_header,
                                 gen_stubs_left,
                                 title = gen_title,
                                 txt_fmt = gen_fmt
                                 )

    gen_stubs_right = ('Method:',
                       'Time:',
                       'Number of Obs:',
                       'df resid')
    gen_data_right = ([modeltype], #was dist family need to look at more
                      time_of_day,
                      [nobs],
                      [df_resid]
                      )
    gen_table_right = SimpleTable(gen_data_right,
                                  gen_header,
                                  gen_stubs_right,
                                  title = gen_title,
                                  txt_fmt = gen_fmt)
    gen_table_left.extend_right(gen_table_right)
    general_table = gen_table_left

    #Parameters part of the summary table
    #------------------------------------
    #Note: this is not necessary since we standardized names, only t versus normal
    tstats = {'OLS' : self.t(),
            'GLS' : self.t(),
            'GLSAR' : self.t(),
            'WLS' : self.t(),
            'RLM' : self.t(),
            'GLM' : self.t()}
    prob_stats = {'OLS' : self.pvalues,
                  'GLS' : self.pvalues,
                  'GLSAR' : self.pvalues,
                  'WLS' : self.pvalues,
                  'RLM' : self.pvalues,
                  'GLM' : self.pvalues
                  }
    #Dictionary to store the header names for the parameter part of the
    #summary table. look up by modeltype
    alp = str((1-alpha)*100)+'%'
    param_header = {
         'OLS'   : ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
         'GLS'   : ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
         'GLSAR' : ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
         'WLS'   : ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
         'GLM'   : ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'], #glm uses t-distribution
         'RLM'   : ['coef', 'std err', 'z', 'P>|z|', alp + ' Conf. Interval']  # check z
                   }
    params_stubs = xname
    params = self.params
    conf_int = self.conf_int(alpha)
    std_err = self.bse
    exog_len = lrange(len(xname))
    tstat = tstats[modeltype]
    prob_stat = prob_stats[modeltype]

    # SimpleTable should be able to handle the formatting
    params_data = lzip(["%#6.4g" % (params[i]) for i in exog_len],
                       ["%#6.4f" % (std_err[i]) for i in exog_len],
                       ["%#6.4f" % (tstat[i]) for i in exog_len],
                       ["%#6.4f" % (prob_stat[i]) for i in exog_len],
                       ["(%#5g, %#5g)" % tuple(conf_int[i]) for i in exog_len])
    parameter_table = SimpleTable(params_data,
                                  param_header[modeltype],
                                  params_stubs,
                                  title = None,
                                  txt_fmt = fmt_2, #gen_fmt,
                                  )

    #special table
    #-------------
    #TODO: exists in linear_model, what about other models
    #residual diagnostics


    #output options
    #--------------
    #TODO: JP the rest needs to be fixed, similar to summary in linear_model

    def ols_printer():
        """
        print summary table for ols models
        """
        table = str(general_table)+'\n'+str(parameter_table)
        return table

    def ols_to_csv():
        """
        exports ols summary data to csv
        """
        pass
    def glm_printer():
        table = str(general_table)+'\n'+str(parameter_table)
        return table

    printers  = {'OLS': ols_printer,
                 'GLM': glm_printer}

    if returns=='print':
        try:
            return printers[modeltype]()
        except KeyError:
            return printers['OLS']()
Example #43
0
def summary_params_2d(result, extras=None, endog_names=None, exog_names=None,
                      title=None):
    '''create summary table of regression parameters with several equations

    This allows interleaving of parameters with bse and/or tvalues

    Parameters
    ----------
    result : result instance
        the result instance with params and attributes in extras
    extras : list of strings
        additional attributes to add below a parameter row, e.g. bse or tvalues
    endog_names : None or list of strings
        names for rows of the parameter array (multivariate endog)
    exog_names : None or list of strings
        names for columns of the parameter array (exog)
    alpha : float
        level for confidence intervals, default 0.95
    title : None or string

    Returns
    -------
    tables : list of SimpleTable
        this contains a list of all separate subtables
    table_all : SimpleTable
        the merged table with results concatenated for each row of the parameter
        array

    '''
    if endog_names is None:
        #TODO: note the [1:] is specific to current MNLogit
        endog_names = ['endog_%d' % i for i in
                            np.unique(result.model.endog)[1:]]
    if exog_names is None:
        exog_names = ['var%d' %i for i in range(len(result.params))]

    #TODO: check formatting options with different values
    #res_params = [['%10.4f'%item for item in row] for row in result.params]
    res_params = [[forg(item, prec=4) for item in row] for row in result.params]
    if extras: #not None or non-empty
        #maybe this should be a simple triple loop instead of list comprehension?
        #below_list = [[['%10s' % ('('+('%10.3f'%v).strip()+')')
        extras_list = [[['%10s' % ('(' + forg(v, prec=3).strip() + ')')
                                for v in col]
                                for col in getattr(result, what)]
                                for what in extras]
        data = lzip(res_params, *extras_list)
        data = [i for j in data for i in j]  #flatten
        stubs = lzip(endog_names, *[['']*len(endog_names)]*len(extras))
        stubs = [i for j in stubs for i in j] #flatten
        #return SimpleTable(data, headers=exog_names, stubs=stubs)
    else:
        data = res_params
        stubs = endog_names
#        return SimpleTable(data, headers=exog_names, stubs=stubs,
#                       data_fmts=['%10.4f'])

    import copy
    txt_fmt = copy.deepcopy(fmt_params)
    txt_fmt.update(dict(data_fmts = ["%s"]*result.params.shape[1]))
    return SimpleTable(data, headers=exog_names,
                             stubs=stubs,
                             title=title,
#                             data_fmts = ["%s"]),
                             txt_fmt = txt_fmt)
Example #44
0
 def test_rmse(self):
     results = self.res1.results
     for i in range(len(results)):
         assert_almost_equal(results[i].mse_resid**.5,
                 eval('self.res2.rmse_'+str(i+1)), DECIMAL_6)
Example #45
0
 def test_cum_irf_stderr(self):
     stderr = self.irf.cum_effect_stderr(orth=False)
     for i in range(1, 1 + len(self.lut.cum_irf_stderr)):
         assert_almost_equal(np.round(stderr[i], 3),
                             self.lut.cum_irf_stderr[i-1])
Example #46
0
    def initialize(self, model):
        """
        Called on the first call to update

        `ilabels` is a list of n_i x n_i matrices containing integer
        labels that correspond to specific correlation parameters.
        Two elements of ilabels[i] with the same label share identical
        variance components.

        `designx` is a matrix, with each row containing dummy
        variables indicating which variance components are associated
        with the corresponding element of QY.
        """

        super(Nested, self).initialize(model)

        if self.model.weights is not None:
            warnings.warn(
                "weights not implemented for nested cov_struct, "
                "using unweighted covariance estimate", NotImplementedWarning)

        # A bit of processing of the nest data
        id_matrix = np.asarray(self.model.dep_data)
        if id_matrix.ndim == 1:
            id_matrix = id_matrix[:, None]
        self.id_matrix = id_matrix

        endog = self.model.endog_li
        designx, ilabels = [], []

        # The number of layers of nesting
        n_nest = self.id_matrix.shape[1]

        for i in range(self.model.num_group):
            ngrp = len(endog[i])
            glab = self.model.group_labels[i]
            rix = self.model.group_indices[glab]

            # Determine the number of common variance components
            # shared by each pair of observations.
            ix1, ix2 = np.tril_indices(ngrp, -1)
            ncm = (self.id_matrix[rix[ix1], :] == self.id_matrix[rix[ix2], :]
                   ).sum(1)

            # This is used to construct the working correlation
            # matrix.
            ilabel = np.zeros((ngrp, ngrp), dtype=np.int32)
            ilabel[ix1, ix2] = ncm + 1
            ilabel[ix2, ix1] = ncm + 1
            ilabels.append(ilabel)

            # This is used to estimate the variance components.
            dsx = np.zeros((len(ix1), n_nest + 1), dtype=np.float64)
            dsx[:, 0] = 1
            for k in np.unique(ncm):
                ii = np.flatnonzero(ncm == k)
                dsx[ii, 1:k + 1] = 1
            designx.append(dsx)

        self.designx = np.concatenate(designx, axis=0)
        self.ilabels = ilabels

        svd = np.linalg.svd(self.designx, 0)
        self.designx_u = svd[0]
        self.designx_s = svd[1]
        self.designx_v = svd[2].T
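The pairwise count of shared variance components (`ncm` above) can be illustrated on a toy id matrix; the two nesting levels here (say school and classroom) are made-up data:

import numpy as np

# four observations in one group, two levels of nesting
id_matrix = np.array([[1, 1],
                      [1, 1],
                      [1, 2],
                      [2, 3]])

ix1, ix2 = np.tril_indices(4, -1)
# number of nesting labels shared by each pair of observations
ncm = (id_matrix[ix1, :] == id_matrix[ix2, :]).sum(1)
# same classroom -> 2 shared components, same school only -> 1, neither -> 0
print(list(zip(ix1, ix2, ncm)))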
Example #47
0
    def cdf(self, endog_predict=None, exog_predict=None):
        r"""
        Cumulative distribution function for the conditional density.

        Parameters
        ----------
        endog_predict: array_like, optional
            The evaluation dependent variables at which the cdf is estimated.
            If not specified the training dependent variables are used.
        exog_predict: array_like, optional
            The evaluation independent variables at which the cdf is estimated.
            If not specified the training independent variables are used.

        Returns
        -------
        cdf_est: array_like
            The estimate of the cdf.

        Notes
        -----
        For more details on the estimation see [2]_, and p.181 in [1]_.

        The multivariate conditional CDF for mixed data (continuous and
        ordered/unordered discrete) is estimated by:

        .. math::

            F(y|x)=\frac{n^{-1}\sum_{i=1}^{n}G(\frac{y-Y_{i}}{h_{0}}) W_{h}(X_{i},x)}{\widehat{\mu}(x)}

        where G() is the product kernel CDF estimator for the dependent (y)
        variable(s) and W() is the product kernel CDF estimator for the
        independent variable(s).

        References
        ----------
        .. [1] Racine, J., Li, Q. Nonparametric econometrics: theory and
                practice. Princeton University Press. (2007)
        .. [2] Liu, R., Yang, L. "Kernel estimation of multivariate cumulative
                    distribution function." Journal of Nonparametric
                    Statistics (2008)
        """
        if endog_predict is None:
            endog_predict = self.endog
        else:
            endog_predict = _adjust_shape(endog_predict, self.k_dep)
        if exog_predict is None:
            exog_predict = self.exog
        else:
            exog_predict = _adjust_shape(exog_predict, self.k_indep)

        N_data_predict = np.shape(exog_predict)[0]
        cdf_est = np.empty(N_data_predict)
        for i in range(N_data_predict):
            mu_x = gpke(self.bw[self.k_dep:],
                        data=self.exog,
                        data_predict=exog_predict[i, :],
                        var_type=self.indep_type) / self.nobs
            mu_x = np.squeeze(mu_x)
            cdf_endog = gpke(self.bw[0:self.k_dep],
                             data=self.endog,
                             data_predict=endog_predict[i, :],
                             var_type=self.dep_type,
                             ckertype="gaussian_cdf",
                             ukertype="aitchisonaitken_cdf",
                             okertype='wangryzin_cdf',
                             tosum=False)

            cdf_exog = gpke(self.bw[self.k_dep:],
                            data=self.exog,
                            data_predict=exog_predict[i, :],
                            var_type=self.indep_type,
                            tosum=False)
            S = (cdf_endog * cdf_exog).sum(axis=0)
            cdf_est[i] = S / (self.nobs * mu_x)

        return cdf_est
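A hedged usage sketch of the conditional CDF on simulated data; the bandwidth choice and sample size below are arbitrary:

import numpy as np
import statsmodels.api as sm

np.random.seed(0)
x = np.random.normal(size=300)
y = 2 * x + np.random.normal(size=300)

dens = sm.nonparametric.KDEMultivariateConditional(
    endog=[y], exog=[x], dep_type='c', indep_type='c',
    bw='normal_reference')

# P(Y <= 0 | X = 0) estimated from the kernel conditional CDF
p = dens.cdf(endog_predict=[0.0], exog_predict=[0.0])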
Example #48
0
    def imse(self, bw):
        r"""
        Returns the Integrated Mean Square Error for the unconditional KDE.

        Parameters
        ----------
        bw: array_like
            The bandwidth parameter(s).

        Returns
        -------
        CV: float
            The cross-validation objective function.

        Notes
        -----
        See p. 27 in [1]_ for details on how to handle the multivariate
        estimation with mixed data types see p.6 in [2]_.

        The formula for the cross-validation objective function is:

        .. math:: CV=\frac{1}{n^{2}}\sum_{i=1}^{n}\sum_{j=1}^{N}
            \bar{K}_{h}(X_{i},X_{j})-\frac{2}{n(n-1)}\sum_{i=1}^{n}
            \sum_{j=1,j\neq i}^{N}K_{h}(X_{i},X_{j})

        Where :math:`\bar{K}_{h}` is the multivariate product convolution
        kernel (consult [2]_ for mixed data types).

        References
        ----------
        .. [1] Racine, J., Li, Q. Nonparametric econometrics: theory and
                practice. Princeton University Press. (2007)
        .. [2] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
                with Categorical and Continuous Data." Working Paper. (2000)
        """
        #F = 0
        #for i in range(self.nobs):
        #    k_bar_sum = gpke(bw, data=-self.data,
        #                     data_predict=-self.data[i, :],
        #                     var_type=self.var_type,
        #                     ckertype='gauss_convolution',
        #                     okertype='wangryzin_convolution',
        #                     ukertype='aitchisonaitken_convolution')
        #    F += k_bar_sum
        ## there is a + because loo_likelihood returns the negative
        #return (F / self.nobs**2 + self.loo_likelihood(bw) * \
        #        2 / ((self.nobs) * (self.nobs - 1)))

        # The code below is equivalent to the commented-out code above.  It's
        # about 20% faster due to some code being moved outside the for-loops
        # and shared by gpke() and loo_likelihood().
        F = 0
        kertypes = dict(c=kernels.gaussian_convolution,
                        o=kernels.wang_ryzin_convolution,
                        u=kernels.aitchison_aitken_convolution)
        nobs = self.nobs
        data = -self.data
        var_type = self.var_type
        ix_cont = np.array([c == 'c' for c in var_type])
        _bw_cont_product = bw[ix_cont].prod()
        Kval = np.empty(data.shape)
        for i in range(nobs):
            for ii, vtype in enumerate(var_type):
                Kval[:, ii] = kertypes[vtype](bw[ii], data[:, ii], data[i, ii])

            dens = Kval.prod(axis=1) / _bw_cont_product
            k_bar_sum = dens.sum(axis=0)
            F += k_bar_sum  # sum of prod kernel over nobs

        kertypes = dict(c=kernels.gaussian,
                        o=kernels.wang_ryzin,
                        u=kernels.aitchison_aitken)
        LOO = LeaveOneOut(self.data)
        L = 0  # leave-one-out likelihood
        Kval = np.empty((data.shape[0] - 1, data.shape[1]))
        for i, X_not_i in enumerate(LOO):
            for ii, vtype in enumerate(var_type):
                Kval[:, ii] = kertypes[vtype](bw[ii], -X_not_i[:, ii],
                                              data[i, ii])
            dens = Kval.prod(axis=1) / _bw_cont_product
            L += dens.sum(axis=0)

        # CV objective function; see the formula in the Notes section above
        return (F / nobs**2 - 2 * L / (nobs * (nobs - 1)))
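For a single continuous variable with a Gaussian kernel, the cross-validation objective above reduces to the classic least-squares CV criterion. A compact, purely illustrative sketch (not the statsmodels implementation; the function name is made up):

import numpy as np

def lscv_gaussian(bw, x):
    """Least-squares CV criterion for a 1-D Gaussian KDE (illustration)."""
    n = len(x)
    u = (x[:, None] - x[None, :]) / bw
    # convolution kernel K-bar: density of N(0, 2) evaluated at u
    k_bar = np.exp(-u ** 2 / 4) / np.sqrt(4 * np.pi)
    # ordinary Gaussian kernel with the i == j terms removed
    k = np.exp(-u ** 2 / 2) / np.sqrt(2 * np.pi)
    np.fill_diagonal(k, 0.0)
    return (k_bar.sum() / (n ** 2 * bw)
            - 2 * k.sum() / (n * (n - 1) * bw))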
def regression(x_name, y_name, maxlag, data=data_input):
    print("!!!!!!!!!start regression!!!!!!!!!")
    print(x_name)
    print(y_name)
    v = 0.1
    data.printSchema()
    # data.show(10)
    # print(data.count())
    dataFrame = data

    input_feature_name = []
    for lagnumber in range(1, maxlag + 1):
        newname = "{}_t-{}".format(x_name, lagnumber)
        input_feature_name.append(newname)

    print("input_feature_name are")
    print(input_feature_name)

    assembler_for_lag = VectorAssembler(
        inputCols=input_feature_name,
        outputCol="features")

    dt = DecisionTreeRegressor(featuresCol="features", labelCol='{}'.format(y_name), maxDepth=6, minInstancesPerNode=10,
                               seed=0)
    pipeline = Pipeline(stages=[assembler_for_lag, dt])
    model = pipeline.fit(dataFrame)
    predictions = model.transform(dataFrame)

    # now predictions is the new dataFrame instead of the original dataFrame
    predictions = predictions.withColumnRenamed("prediction", 'predicted_{}'.format(y_name))
    # print("predictions dataframe is ")
    # predictions.select('predicted_{}'.format(y_name), '{}'.format(y_name), "features").show(5)

    evaluator = RegressionEvaluator(
        labelCol='{}'.format(y_name), predictionCol='predicted_{}'.format(y_name), metricName="mse")
    mse = evaluator.evaluate(predictions)
    print("Mean Squared Error (MSE) on test data = %g" % mse)
    featureImportances = model.stages[1].featureImportances
    print("Feature Importance")
    print(featureImportances)

    y_hat = predictions.select('predicted_{}'.format(y_name))
    y_hat = y_hat.withColumn("yid", monotonically_increasing_id())
    # print(y_hat.count())

    # compute residual value of y, y-y_hat, the residual value is the y in next round of loop
    if y_name == x_name:
        # learning rate is not in model 0
        # dataFrame = dataFrame.join(y_hat, col("id") == (col("yid")+maxlag))
        dataFrame = dataFrame.join(y_hat, col("id") == col("yid"))
        residual = dataFrame['{}'.format(y_name)] - dataFrame['predicted_{}'.format(y_name)]
        dataFrame = dataFrame.withColumn("{}res{}".format(y_name, x_name), residual)
        # dataFrame.show(5)
        dataFrame = dataFrame.drop("yid")
        return_col = dataFrame.select("{}res{}".format(y_name, x_name))
        print("still round 1")
        # print(dataFrame.count())
    else:
        # apply learning rate
        dataFrame = dataFrame.join(y_hat, col("id") == col("yid"))
        dataFrame = dataFrame.withColumn('v_predicted_{}'.format(y_name), col('predicted_{}'.format(y_name)) * v)
        residual = dataFrame['{}'.format(y_name)] - dataFrame['v_predicted_{}'.format(y_name)]
        dataFrame = dataFrame.withColumn("{}res{}".format(y_name, x_name), residual)
        dataFrame = dataFrame.drop("yid")
        return_col = dataFrame.select("{}res{}".format(y_name, x_name))
        print("after round 1 ")

    print("data for next step is ")

    return return_col, mse, featureImportances
def arma_acovf(ar, ma, nobs=10, sigma2=1, dtype=None):
    """
    Theoretical autocovariance function of ARMA process

    Parameters
    ----------
    ar : array_like, 1d
        coefficient for autoregressive lag polynomial, including zero lag
    ma : array_like, 1d
        coefficient for moving-average lag polynomial, including zero lag
    nobs : int
        number of terms (lags plus zero lag) to include in returned acovf
    sigma2 : float
        Variance of the innovation term.
    dtype : dtype, optional
        The dtype of the returned array.  If None, it is inferred from
        ``ar``, ``ma`` and ``sigma2``.

    Returns
    -------
    acovf : array
        autocovariance of ARMA process given by ar, ma

    See Also
    --------
    arma_acf
    acovf

    References
    ----------
    .. [*] Brockwell, Peter J., and Richard A. Davis. 2009. Time Series:
        Theory and Methods. 2nd ed. 1991. New York, NY: Springer.
    """
    if dtype is None:
        dtype = np.common_type(np.array(ar), np.array(ma), np.array(sigma2))

    p = len(ar) - 1
    q = len(ma) - 1
    m = max(p, q) + 1

    if sigma2.real < 0:
        raise ValueError('Must have positive innovation variance.')

    # Short-circuit for trivial corner-case
    if p == q == 0:
        out = np.zeros(nobs, dtype=dtype)
        out[0] = sigma2
        return out

    # Get the moving average representation coefficients that we need
    ma_coeffs = arma2ma(ar, ma, lags=m)

    # Solve for the first m autocovariances via the linear system
    # described by (BD, eq. 3.3.8)
    A = np.zeros((m, m), dtype=dtype)
    b = np.zeros((m, 1), dtype=dtype)
    # We need a zero-right-padded version of ar params
    tmp_ar = np.zeros(m, dtype=dtype)
    tmp_ar[:p + 1] = ar
    for k in range(m):
        A[k, :(k + 1)] = tmp_ar[:(k + 1)][::-1]
        A[k, 1:m - k] += tmp_ar[(k + 1):m]
        b[k] = sigma2 * np.dot(ma[k:q + 1], ma_coeffs[:max((q + 1 - k), 0)])
    acovf = np.zeros(max(nobs, m), dtype=dtype)
    acovf[:m] = np.linalg.solve(A, b)[:, 0]

    # Iteratively apply (BD, eq. 3.3.9) to solve for remaining autocovariances
    if nobs > m:
        zi = signal.lfiltic([1], ar, acovf[:m:][::-1])
        acovf[m:] = signal.lfilter([1],
                                   ar,
                                   np.zeros(nobs - m, dtype=dtype),
                                   zi=zi)[0]

    return acovf[:nobs]
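Usage sketch for an AR(1) process with coefficient 0.5; note that ``ar`` and ``ma`` include the zero-lag coefficient, so the AR coefficient enters with a negative sign:

import numpy as np
from statsmodels.tsa.arima_process import arma_acovf

acovf = arma_acovf(ar=[1, -0.5], ma=[1], nobs=5, sigma2=1.0)

# theoretical AR(1) autocovariance: gamma(k) = sigma2 * phi**k / (1 - phi**2)
expected = 0.5 ** np.arange(5) / (1 - 0.5 ** 2)
assert np.allclose(acovf, expected)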
    def test_zero_collinear(self):
        # not completely generic yet
        if isinstance(self.results.model, (sm.GEE)):
            pytest.skip('Not completely generic yet')

        use_start_params = not isinstance(self.results.model,
                                          (sm.RLM, sm.OLS, sm.WLS, sm.GLM))
        self.use_start_params = use_start_params  # attach for _get_constrained
        keep_index = list(range(self.results.model.exog.shape[1]))
        # index for params might include extra params
        keep_index_p = list(range(self.results.params.shape[0]))
        drop_index = []
        for i in drop_index:
            del keep_index[i]
            del keep_index_p[i]

        keep_index_p = list(range(self.results.params.shape[0]))

        # create collinear model
        mod2 = self.results.model
        mod_cls = mod2.__class__
        init_kwds = mod2._get_init_kwds()
        ex = np.column_stack((mod2.exog, mod2.exog))
        mod = mod_cls(mod2.endog, ex, **init_kwds)

        keep_index = list(range(self.results.model.exog.shape[1]))
        keep_index_p = list(range(self.results.model.exog.shape[1]))
        k_vars = ex.shape[1]
        k_extra = 0
        if hasattr(mod, 'k_extra') and mod.k_extra > 0:
            keep_index_p += list(range(k_vars, k_vars + mod.k_extra))
            k_extra = mod.k_extra

        cov_types = ['nonrobust', 'HC0']

        for cov_type in cov_types:
            # Note: for RLM we only check default when cov_type is 'nonrobust'
            # cov_type is otherwise ignored
            if cov_type != 'nonrobust' and (isinstance(self.results.model,
                                                       sm.RLM)):
                return

            if use_start_params:
                start_params = np.zeros(k_vars + k_extra)
                method = self.results.mle_settings['optimizer']
                # string in `method` is not mutable, so no need for copy
                sp = self.results.mle_settings['start_params'].copy()
                if self.transform_index is not None:
                    # work around internal transform_params, currently in NB
                    sp[self.transform_index] = np.exp(sp[self.transform_index])

                start_params[keep_index_p] = sp
                res1 = mod._fit_collinear(cov_type=cov_type,
                                          start_params=start_params,
                                          method=method,
                                          disp=0)
                if cov_type != 'nonrobust':
                    # reestimate original model to get robust cov
                    res2 = self.results.model.fit(cov_type=cov_type,
                                                  start_params=sp,
                                                  method=method,
                                                  disp=0)
            else:
                # more special casing RLM
                if (isinstance(self.results.model, (sm.RLM))):
                    res1 = mod._fit_collinear()
                else:
                    res1 = mod._fit_collinear(cov_type=cov_type)
                if cov_type != 'nonrobust':
                    # reestimate original model to get robust cov
                    res2 = self.results.model.fit(cov_type=cov_type)

            if cov_type == 'nonrobust':
                res2 = self.results

            # check fit optimizer arguments, if mle_settings is available
            if hasattr(res2, 'mle_settings'):
                assert_equal(
                    res1.results_constrained.mle_settings['optimizer'],
                    res2.mle_settings['optimizer'])
                if 'start_params' in res2.mle_settings:
                    spc = res1.results_constrained.mle_settings['start_params']
                    assert_allclose(spc,
                                    res2.mle_settings['start_params'],
                                    rtol=1e-10,
                                    atol=1e-20)
                    assert_equal(res1.mle_settings['optimizer'],
                                 res2.mle_settings['optimizer'])
                    assert_allclose(res1.mle_settings['start_params'],
                                    res2.mle_settings['start_params'],
                                    rtol=1e-10,
                                    atol=1e-20)

            # Poisson has reduced precision in params, difficult optimization?
            assert_allclose(res1.params[keep_index_p], res2.params, rtol=1e-6)
            assert_allclose(res1.params[drop_index], 0, rtol=1e-10)
            assert_allclose(res1.bse[keep_index_p], res2.bse, rtol=1e-8)
            assert_allclose(res1.bse[drop_index], 0, rtol=1e-10)
            assert_allclose(res1.tvalues[keep_index_p],
                            res2.tvalues,
                            rtol=5e-8)
            assert_allclose(res1.pvalues[keep_index_p],
                            res2.pvalues,
                            rtol=1e-6,
                            atol=1e-30)

            if hasattr(res1, 'resid'):
                # discrete models, Logit don't have `resid` yet
                assert_allclose(res1.resid, res2.resid, rtol=1e-5, atol=1e-10)

            ex = res1.model.exog.mean(0)
            predicted1 = res1.predict(ex, **self.predict_kwds)
            predicted2 = res2.predict(ex[keep_index], **self.predict_kwds)
            assert_allclose(predicted1, predicted2, rtol=1e-8, atol=1e-11)

            ex = res1.model.exog[:5]
            kwds = getattr(self, 'predict_kwds_5', {})

            predicted1 = res1.predict(ex, **kwds)
            predicted2 = res2.predict(ex[:, keep_index], **kwds)
            assert_allclose(predicted1, predicted2, rtol=1e-8, atol=1e-11)
Example #52
0
def kdesum(x, axis=0):
    return np.asarray([np.sum(x[i] - x, axis) for i in range(len(x))])
Example #53
0
def lagmat(x, maxlag, trim='forward', original='ex', use_pandas=False):
    """
    Create 2d array of lags

    Parameters
    ----------
    x : array_like, 1d or 2d
        data; if 2d, observation in rows and variables in columns
    maxlag : int
        all lags from zero to maxlag are included
    trim : str {'forward', 'backward', 'both', 'none'} or None
        * 'forward' : trim invalid observations in front
        * 'backward' : trim invalid initial observations
        * 'both' : trim invalid observations on both sides
        * 'none', None : no trimming of observations
    original : str {'ex','sep','in'}
        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a single
          array.
        * 'sep' : returns a tuple (original array, lagged values). The original
                  array is truncated to have the same number of rows as
                  the returned lagmat.
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        array with lagged observations
    y : 2d array, optional
        Only returned if original == 'sep'

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can only
    be 'forward' or 'both' since it is not possible to consistently extend index
    values.
    """
    # TODO:  allow list of lags additional to maxlag
    is_pandas = _is_using_pandas(x, None) and use_pandas
    trim = 'none' if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ('none', 'backward'):
        raise ValueError("trim cannot be 'none' or 'backward' when used on "
                         "Series or DataFrames")

    xa = np.asarray(x)
    dropidx = 0
    if xa.ndim == 1:
        xa = xa[:, None]
    nobs, nvar = xa.shape
    if original in ['ex', 'sep']:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[maxlag - k:nobs + maxlag - k,
           nvar * (maxlag - k):nvar * (maxlag - k + 1)] = xa

    if trim in ('none', 'forward'):
        startobs = 0
    elif trim in ('backward', 'both'):
        startobs = maxlag
    else:
        raise ValueError('trim option not valid')

    if trim in ('none', 'backward'):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + '.L.' + lag_str for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ('sep', 'ex'):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == 'sep':
            leads = lm[startobs:stopobs, :dropidx]

    if original == 'sep':
        return lags, leads
    else:
        return lags
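With ``use_pandas=True`` the index of the input is preserved and the lag columns are named ``<col>.L.<lag>``; a brief sketch (the Series name ``y`` is arbitrary):

import numpy as np
import pandas as pd
from statsmodels.tsa.tsatools import lagmat

s = pd.Series(np.arange(1.0, 6.0), name="y")
# drop the contemporaneous column and keep only fully observed rows
lags = lagmat(s, maxlag=2, trim="both", original="ex", use_pandas=True)
print(lags)   # columns 'y.L.1' and 'y.L.2', index 2..4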
Example #54
0
 def test_llf(self):
     results = self.res1.results
     assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_2)
     for i in range(len(results)):
         assert_almost_equal(results[i].llf,
                 eval('self.res2.llf_'+str(i+1)), DECIMAL_2)
Example #55
0
def summary_col(results,
                float_format='%.4f',
                model_names=(),
                stars=False,
                info_dict=None,
                regressor_order=(),
                drop_omitted=False):
    """
    Summarize multiple results instances side-by-side (coefs and SEs)

    Parameters
    ----------
    results : statsmodels results instance or list of result instances
    float_format : string, optional
        float format for coefficients and standard errors
        Default : '%.4f'
    model_names : list of strings, optional
        Must have same length as the number of results. If the names are not
        unique, a roman numeral will be appended to all model names
    stars : bool
        print significance stars
    info_dict : dict
        dict of functions to be applied to results instances to retrieve
        model info. To use specific information for different models, add a
        (nested) info_dict with model name as the key.
        Example: `info_dict = {"N":..., "R2": ..., "OLS":{"R2":...}}` would
        only show `R2` for OLS regression models, but additionally `N` for
        all other results.
        Default : None (use the info_dict specified in
        result.default_model_infos, if this property exists)
    regressor_order : list of strings, optional
        list of names of the regressors in the desired order. All regressors
        not specified will be appended to the end of the list.
    drop_omitted : bool, optional
        If True, only regressors named in regressor_order are included in
        the table. If False, regressors not specified in regressor_order
        are appended to the end of the list.
    """

    if not isinstance(results, list):
        results = [results]

    cols = [
        _col_params(x, stars=stars, float_format=float_format) for x in results
    ]

    # Unique column names (pandas has problems merging otherwise)
    if model_names:
        colnames = _make_unique(model_names)
    else:
        colnames = _make_unique([x.columns[0] for x in cols])
    for i in range(len(cols)):
        cols[i].columns = [colnames[i]]

    merg = lambda x, y: x.merge(
        y, how='outer', right_index=True, left_index=True)
    summ = reduce(merg, cols)

    if regressor_order:
        varnames = summ.index.get_level_values(0).tolist()
        ordered = [x for x in regressor_order if x in varnames]
        unordered = [x for x in varnames if x not in regressor_order + ['']]
        order = ordered + list(np.unique(unordered))

        f = lambda idx: sum([[x + 'coef', x + 'stde'] for x in idx], [])
        summ.index = f(pd.unique(varnames))
        summ = summ.reindex(f(order))
        summ.index = [x[:-4] for x in summ.index]
        if drop_omitted:
            summ = summ.loc[regressor_order]

    idx = pd.Series(lrange(summ.shape[0])) % 2 == 1
    summ.index = np.where(idx, '', summ.index.get_level_values(0))

    # add infos about the models.
    if info_dict:
        cols = [
            _col_info(x, info_dict.get(x.model.__class__.__name__, info_dict))
            for x in results
        ]
    else:
        cols = [
            _col_info(x, getattr(x, "default_model_infos", None))
            for x in results
        ]
    # use unique column names, otherwise the merge will not succeed
    for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])):
        df.columns = [name]
    merg = lambda x, y: x.merge(
        y, how='outer', right_index=True, left_index=True)
    info = reduce(merg, cols)
    dat = pd.DataFrame(np.vstack([summ, info]))  # pd.concat better, but error
    dat.columns = summ.columns
    dat.index = pd.Index(summ.index.tolist() + info.index.tolist())
    summ = dat

    summ = summ.fillna('')

    smry = Summary()
    smry._merge_latex = True
    smry.add_df(summ, header=True, align='l')
    smry.add_text('Standard errors in parentheses.')
    if stars:
        smry.add_text('* p<.1, ** p<.05, ***p<.01')

    return smry
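A short usage sketch with two OLS fits on simulated data; the `info_dict` entries shown here (N and R2) are arbitrary examples:

import numpy as np
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

np.random.seed(0)
exog = sm.add_constant(np.random.randn(100, 2))
y1 = exog @ [1.0, 0.5, -0.2] + np.random.randn(100)
y2 = exog @ [0.3, 0.1, 0.8] + np.random.randn(100)

res1 = sm.OLS(y1, exog).fit()
res2 = sm.OLS(y2, exog).fit()

table = summary_col([res1, res2], stars=True, float_format='%.3f',
                    model_names=['m1', 'm2'],
                    info_dict={'N': lambda r: '%d' % int(r.nobs),
                               'R2': lambda r: '%.3f' % r.rsquared})
print(table)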
Example #56
0
 def test_rsquared(self):
     results = self.res1.results
     for i in range(len(results)):
         assert_almost_equal(results[i].rsquared,
                 eval('self.res2.rsquared_'+str(i+1)), DECIMAL_3)
Example #57
0
def _hierarchical_split(count_dict, horizontal=True, gap=0.05):
    """
    Split a square in a hierarchical way given a contingency table.

    Hierarchically split the unit square in alternate directions
    in proportion to the subdivision contained in the contingency table
    count_dict.  This is the function that actually perform the tiling
    for the creation of the mosaic plot.  If the gap array has been specified
    it will insert a corresponding amount of space (proportional to the
    unit length), while retaining the proportionality of the tiles.

    Parameters
    ----------
    count_dict : dict
        Dictionary containing the contingency table.
        Each category should contain a non-negative number
        with a tuple as index.  It expects all combinations
        of keys to be represented; if that is not true, the
        missing values are automatically treated as 0
    horizontal : bool
        The starting direction of the split (by default along
        the horizontal axis)
    gap : float or array of floats
        The list of gaps to be applied on each subdivision.
        If the length of the given array is less than the number
        of subcategories (or if it's a single number), it will be
        extended with exponentially decreasing gaps

    Returns
    -------
    base_rect : dict
        A dictionary containing the result of the split.
        To each key is associated a 4-tuple of coordinates
        that are required to create the corresponding rectangle:

            0 - x position of the lower left corner
            1 - y position of the lower left corner
            2 - width of the rectangle
            3 - height of the rectangle
    """
    # this is the unit square that we are going to divide
    base_rect = OrderedDict([(tuple(), (0, 0, 1, 1))])
    # get the list of each possible value for each level
    categories_levels = _categories_level(list(iterkeys(count_dict)))
    L = len(categories_levels)

    # recreate the gaps vector starting from an int
    if not np.iterable(gap):
        gap = [gap / 1.5**idx for idx in range(L)]
    # extend if it's too short
    if len(gap) < L:
        last = gap[-1]
        gap = list(gap) + [last / 1.5**idx for idx in range(L)]
    # trim if it's too long
    gap = gap[:L]
    # put the count dictionary in order by key
    # this will allow some code simplification
    count_ordered = OrderedDict([(k, count_dict[k])
                                 for k in list(product(*categories_levels))])
    for cat_idx, cat_enum in enumerate(categories_levels):
        # get the partial key up to the actual level
        base_keys = list(product(*categories_levels[:cat_idx]))
        for key in base_keys:
            # for each partial key and each value, calculate how many
            # observations we have in the counting dictionary
            part_count = [
                _reduce_dict(count_ordered, key + (partial, ))
                for partial in cat_enum
            ]
            # reduce the gap for subsequent levels
            new_gap = gap[cat_idx]
            # split the given subkeys in the rectangle dictionary
            base_rect = _key_splitting(base_rect, cat_enum, part_count, key,
                                       horizontal, new_gap)
        horizontal = not horizontal
    return base_rect
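This splitter is internal; the public entry point built on it is `statsmodels.graphics.mosaicplot.mosaic`, which accepts the same tuple-keyed contingency dictionary. A brief, hedged usage sketch with made-up counts:

from collections import OrderedDict
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic

# 2x2 contingency table keyed by tuples of category values
counts = OrderedDict([(('a', 'x'), 10), (('a', 'y'), 5),
                      (('b', 'x'), 7), (('b', 'y'), 12)])
fig, rects = mosaic(counts, gap=0.05)
plt.show()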
Example #58
0
    def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather',
            max_iter=1000, p_tol=1e-6, **kwargs):
        '''Solve by Iterative Weighted Least Squares

        Parameters
        ----------
        q : float
            Quantile must be between 0 and 1
        vcov : string, method used to calculate the variance-covariance matrix
            of the parameters. Default is ``robust``:

            - robust : heteroskedasticity robust standard errors (as suggested
              in Greene 6th edition)
            - iid : iid errors (as in Stata 12)

        kernel : string, kernel to use in the kernel density estimation for the
            asymptotic covariance matrix:

            - epa: Epanechnikov
            - cos: Cosine
            - gau: Gaussian
            - par: Parzen
            - biw: Biweight

        bandwidth: string, Bandwidth selection method in kernel density
            estimation for asymptotic covariance estimate (full
            references in QuantReg docstring):

            - hsheather: Hall-Sheather (1988)
            - bofinger: Bofinger (1975)
            - chamberlain: Chamberlain (1994)
        '''

        if q < 0 or q > 1:
            raise Exception('q must be between 0 and 1')

        kern_names = ['biw', 'cos', 'epa', 'gau', 'par']
        if kernel not in kern_names:
            raise Exception("kernel must be one of " + ', '.join(kern_names))
        else:
            kernel = kernels[kernel]

        if bandwidth == 'hsheather':
            bandwidth = hall_sheather
        elif bandwidth == 'bofinger':
            bandwidth = bofinger
        elif bandwidth == 'chamberlain':
            bandwidth = chamberlain
        else:
            raise Exception("bandwidth must be in 'hsheather', 'bofinger', 'chamberlain'")

        endog = self.endog
        exog = self.exog
        nobs = self.nobs
        exog_rank = np_matrix_rank(self.exog)
        self.rank = exog_rank
        self.df_model = float(self.rank - self.k_constant)
        self.df_resid = self.nobs - self.rank
        n_iter = 0
        xstar = exog

        beta = np.ones(exog_rank)
        # TODO: better start, initial beta is used only for convergence check

        # Note the following doesn't work yet,
        # the iteration loop always starts with OLS as initial beta
#        if start_params is not None:
#            if len(start_params) != rank:
#                raise ValueError('start_params has wrong length')
#            beta = start_params
#        else:
#            # start with OLS
#            beta = np.dot(np.linalg.pinv(exog), endog)

        diff = 10
        cycle = False

        history = dict(params = [], mse=[])
        while n_iter < max_iter and diff > p_tol and not cycle:
            n_iter += 1
            beta0 = beta
            xtx = np.dot(xstar.T, exog)
            xty = np.dot(xstar.T, endog)
            beta = np.dot(pinv(xtx), xty)
            resid = endog - np.dot(exog, beta)

            mask = np.abs(resid) < .000001
            resid[mask] = ((resid[mask] >= 0) * 2 - 1) * .000001
            resid = np.where(resid < 0, q * resid, (1-q) * resid)
            resid = np.abs(resid)
            xstar = exog / resid[:, np.newaxis]
            diff = np.max(np.abs(beta - beta0))
            history['params'].append(beta)
            history['mse'].append(np.mean(resid*resid))

            if (n_iter >= 300) and (n_iter % 100 == 0):
                # check for convergence circle, shouldn't happen
                for ii in range(2, 10):
                    if np.all(beta == history['params'][-ii]):
                        cycle = True
                        warnings.warn("Convergence cycle detected", ConvergenceWarning)
                        break

        if n_iter == max_iter:
            warnings.warn("Maximum number of iterations (" + str(max_iter) + 
                          ") reached.", IterationLimitWarning)

        e = endog - np.dot(exog, beta)
        # Greene (2008, p.407) writes that Stata 6 uses this bandwidth:
        # h = 0.9 * np.std(e) / (nobs**0.2)
        # Instead, we calculate bandwidth as in Stata 12
        iqre = stats.scoreatpercentile(e, 75) - stats.scoreatpercentile(e, 25)
        h = bandwidth(nobs, q)
        h = min(np.std(endog),
                iqre / 1.34) * (norm.ppf(q + h) - norm.ppf(q - h))

        fhat0 = 1. / (nobs * h) * np.sum(kernel(e / h))

        if vcov == 'robust':
            d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
            xtxi = pinv(np.dot(exog.T, exog))
            xtdx = np.dot(exog.T * d[np.newaxis, :], exog)
            vcov = chain_dot(xtxi, xtdx, xtxi)
        elif vcov == 'iid':
            vcov = (1. / fhat0)**2 * q * (1 - q) * pinv(np.dot(exog.T, exog))
        else:
            raise Exception("vcov must be 'robust' or 'iid'")

        lfit = QuantRegResults(self, beta, normalized_cov_params=vcov)

        lfit.q = q
        lfit.iterations = n_iter
        lfit.sparsity = 1. / fhat0
        lfit.bandwidth = h
        lfit.history = history

        return RegressionResultsWrapper(lfit)
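A usage sketch of median and upper-quantile fits on simulated heavy-tailed data; the simulated coefficients below are arbitrary:

import numpy as np
import statsmodels.api as sm

np.random.seed(0)
exog = sm.add_constant(np.random.uniform(0, 10, size=200))
endog = 1.0 + 0.5 * exog[:, 1] + np.random.standard_t(df=3, size=200)

mod = sm.QuantReg(endog, exog)
res_median = mod.fit(q=0.5)                      # defaults: robust vcov, 'epa' kernel
res_upper = mod.fit(q=0.9, kernel='gau', bandwidth='bofinger')
print(res_median.params, res_upper.params)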
Example #59
0
def _create_labels(rects, horizontal, ax, rotation):
    """find the position of the label for each value of each category

    right now it supports only up to the four categories

    ax: the axis on which the label should be applied
    rotation: the rotation list for each side
    """
    categories = _categories_level(list(iterkeys(rects)))
    if len(categories) > 4:
        msg = ("maximum of 4 level supported for axes labeling..and 4"
               "is alreay a lot of level, are you sure you need them all?")
        raise NotImplementedError(msg)
    labels = {}
    #keep it fixed as it will be used a lot of times
    items = list(iteritems(rects))
    vertical = not horizontal

    #get the axis ticks and labels locator to put the correct values!
    ax2 = ax.twinx()
    ax3 = ax.twiny()
    #this is the order of execution for horizontal disposition
    ticks_pos = [ax.set_xticks, ax.set_yticks, ax3.set_xticks, ax2.set_yticks]
    ticks_lab = [
        ax.set_xticklabels, ax.set_yticklabels, ax3.set_xticklabels,
        ax2.set_yticklabels
    ]
    #for the vertical one, rotate it by one
    if vertical:
        ticks_pos = ticks_pos[1:] + ticks_pos[:1]
        ticks_lab = ticks_lab[1:] + ticks_lab[:1]
    #clean them
    for pos, lab in zip(ticks_pos, ticks_lab):
        pos([])
        lab([])
    #for each level, for each value in the level, take the mean of all
    #the sublevel that correspond to that partial key
    for level_idx, level in enumerate(categories):
        #this dictionary keep the labels only for this level
        level_ticks = dict()
        for value in level:
            #to which level it should refer to get the preceding
            #values of labels? it's rather a tricky question...
            #this is dependent on the side. It's a very crude management
            #but I couldn't think of a more general way...
            if horizontal:
                if level_idx == 3:
                    index_select = [-1, -1, -1]
                else:
                    index_select = [+0, -1, -1]
            else:
                if level_idx == 3:
                    index_select = [+0, -1, +0]
                else:
                    index_select = [-1, -1, -1]
            #now I create the base key name and append the current value.
            #It will search all the rects to find the corresponding ones
            #and use them to evaluate the mean position
            basekey = tuple(categories[i][index_select[i]]
                            for i in range(level_idx))
            basekey = basekey + (value, )
            subset = dict(
                (k, v) for k, v in items if basekey == k[:level_idx + 1])
            #now I extract the centers of all the tiles and take a mean of
            #these centers weighted by the area of each tile
            #this should give me the (more or less) correct position
            #of the center of the category

            vals = list(itervalues(subset))
            W = sum(w * h for (x, y, w, h) in vals)
            x_lab = sum((x + w / 2.0) * w * h / W for (x, y, w, h) in vals)
            y_lab = sum((y + h / 2.0) * w * h / W for (x, y, w, h) in vals)
            #now, based on the ordering, select which position to keep
            #needs to be written in a more general form, or are 4 levels enough?
            #it should also give the horizontal and vertical alignment
            side = (level_idx + vertical) % 4
            level_ticks[value] = y_lab if side % 2 else x_lab
        #now we add the labels of this level to the correct axis

        ticks_pos[level_idx](list(itervalues(level_ticks)))
        ticks_lab[level_idx](list(iterkeys(level_ticks)),
                             rotation=rotation[level_idx])
    return labels
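
_create_labels is an internal helper of statsmodels' mosaic plot; below is a minimal sketch of the public entry point that ends up placing these labels, assuming statsmodels.graphics.mosaicplot.mosaic and its label_rotation argument, with made-up count data:

import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic

# contingency counts keyed by category tuples (two levels here; at most four are labeled)
data = {('a', 'x'): 10, ('a', 'y'): 5,
        ('b', 'x'): 7, ('b', 'y'): 11}
fig, ax = plt.subplots()
# one rotation value per labeled side, forwarded to the label-placement code above
mosaic(data, ax=ax, horizontal=True, label_rotation=[0, 90])
plt.show()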
Example #60
    def sirf_errband_mc(self,
                        orth=False,
                        repl=1000,
                        T=10,
                        signif=0.05,
                        seed=None,
                        burn=100,
                        cum=False):
        """
        Compute Monte Carlo integrated error bands assuming normally
        distributed for impulse response functions

        Parameters
        ----------
        orth: bool, default False
            Compute orthoganalized impulse response error bands
        repl: int
            number of Monte Carlo replications to perform
        T: int, default 10
            number of impulse response periods
        signif: float (0 < signif <1)
            Significance level for error bars, defaults to 95% CI
        seed: int
            np.random.seed for replications
        burn: int
            number of initial observations to discard for simulation
        cum: bool, default False
            produce cumulative irf error bands

        Notes
        -----
        Lutkepohl (2005) Appendix D

        Returns
        -------
        Tuple of lower and upper arrays of ma_rep monte carlo standard errors

        """
        neqs = self.neqs
        mean = self.mean()
        k_ar = self.k_ar
        coefs = self.coefs
        sigma_u = self.sigma_u
        intercept = self.intercept
        df_model = self.df_model
        nobs = self.nobs

        ma_coll = np.zeros((repl, T + 1, neqs, neqs))
        A = self.A
        B = self.B
        A_mask = self.A_mask
        B_mask = self.B_mask
        A_pass = np.zeros(A.shape, dtype='|S1')
        B_pass = np.zeros(B.shape, dtype='|S1')
        A_pass[~A_mask] = A[~A_mask]
        B_pass[~B_mask] = B[~B_mask]
        A_pass[A_mask] = 'E'
        B_pass[B_mask] = 'E'
        if A_mask.sum() == 0:
            s_type = 'B'
        elif B_mask.sum() == 0:
            s_type = 'A'
        else:
            s_type = 'AB'
        g_list = []

        for i in range(repl):
            #discard the burn-in observations to correct for starting bias
            sim = util.varsim(coefs, intercept, sigma_u, steps=nobs + burn)
            sim = sim[burn:]
            if cum:
                if i < 10:
                    sol = SVAR(sim, svar_type=s_type, A=A_pass,
                               B=B_pass).fit(maxlags=k_ar)
                    g_list.append(np.append(sol.A[sol.A_mask].tolist(),
                                            sol.B[sol.B_mask].tolist()))
                    ma_coll[i] = sol.svar_ma_rep(maxn=T).cumsum(axis=0)
                else:
                    if i == 10:
                        mean_AB = np.mean(g_list, axis=0)
                        split = len(A_pass[A_mask])
                        opt_A = mean_AB[:split]
                        opt_B = mean_AB[split:]
                    res_i = SVAR(sim, svar_type=s_type, A=A_pass,
                                 B=B_pass).fit(maxlags=k_ar,
                                               A_guess=opt_A, B_guess=opt_B)
                    ma_coll[i] = res_i.svar_ma_rep(maxn=T).cumsum(axis=0)

            else:
                if i < 10:
                    sol = SVAR(sim, svar_type=s_type, A=A_pass,
                               B=B_pass).fit(maxlags=k_ar)
                    g_list.append(
                        np.append(sol.A[A_mask].tolist(),
                                  sol.B[B_mask].tolist()))
                    ma_coll[i] = sol.svar_ma_rep(maxn=T)
                else:
                    if i == 10:
                        mean_AB = np.mean(g_list, axis=0)
                        split = len(A[A_mask])
                        opt_A = mean_AB[:split]
                        opt_B = mean_AB[split:]
                    res_i = SVAR(sim, svar_type=s_type, A=A_pass,
                                 B=B_pass).fit(maxlags=k_ar,
                                               A_guess=opt_A, B_guess=opt_B)
                    ma_coll[i] = res_i.svar_ma_rep(maxn=T)

        ma_sort = np.sort(ma_coll, axis=0)  #sort to get quantiles
        index = (int(round(signif / 2 * repl)) - 1,
                 int(round((1 - signif / 2) * repl)) - 1)
        lower = ma_sort[index[0], :, :, :]
        upper = ma_sort[index[1], :, :, :]
        return lower, upper
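
A hedged usage sketch for the error-band method above, assuming statsmodels' SVAR class (statsmodels.tsa.vector_ar.svar_model.SVAR) and the 'E' convention for free elements used in the snippet; the data and the pattern of the A matrix are made up for illustration:

import numpy as np
from statsmodels.tsa.vector_ar.svar_model import SVAR

rng = np.random.default_rng(12345)
nobs, neqs = 500, 2
data = np.zeros((nobs, neqs))
for t in range(1, nobs):  # simple stationary VAR(1) toy data
    data[t] = 0.5 * data[t - 1] + rng.standard_normal(neqs)

# 'A' model: unit diagonal, one free lower-triangular element marked 'E'
A = np.array([[1, 0], ['E', 1]], dtype=object)
res = SVAR(data, svar_type='A', A=A).fit(maxlags=2)

# Monte Carlo error bands for the structural IRFs, computed by the method above
lower, upper = res.sirf_errband_mc(orth=False, repl=200, T=10,
                                   signif=0.05, seed=12345, burn=100)
print(lower.shape, upper.shape)  # each is (T + 1, neqs, neqs)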