Example No. 1
    def pdf(self, endog_predict=None, exog_predict=None):
        r"""
        Evaluate the probability density function.

        Parameters
        ----------
        endog_predict: array_like, optional
            Evaluation data for the dependent variables.  If unspecified, the
            training data is used.
        exog_predict: array_like, optional
            Evaluation data for the independent variables.  If unspecified,
            the training data is used.

        Returns
        -------
        pdf: array_like
            The value of the probability density at `endog_predict` and `exog_predict`.

        Notes
        -----
        The formula for the conditional probability density is:

        .. math:: f(X|Y)=\frac{f(X,Y)}{f(Y)}

        with the marginal density estimated by the generalized product kernel
        estimator

        .. math:: f(Y)=n^{-1}\sum_{j=1}^{n}\prod_{s=1}^{q}h_{s}^{-1}k
                            \left(\frac{Y_{is}-Y_{js}}{h_{s}}\right)

        where :math:`k` is the appropriate kernel for each variable.
        """
        if endog_predict is None:
            endog_predict = self.endog
        else:
            endog_predict = _adjust_shape(endog_predict, self.k_dep)
        if exog_predict is None:
            exog_predict = self.exog
        else:
            exog_predict = _adjust_shape(exog_predict, self.k_indep)

        pdf_est = []
        data_predict = np.column_stack((endog_predict, exog_predict))
        for i in range(np.shape(data_predict)[0]):
            f_yx = gpke(self.bw,
                        data=self.data,
                        data_predict=data_predict[i, :],
                        var_type=(self.dep_type + self.indep_type))
            f_x = gpke(self.bw[self.k_dep:],
                       data=self.exog,
                       data_predict=exog_predict[i, :],
                       var_type=self.indep_type)
            pdf_est.append(f_yx / f_x)

        return np.squeeze(pdf_est)
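
This snippet looks like the ``pdf`` method of statsmodels' ``KDEMultivariateConditional``; if so, a minimal usage sketch could look like the following (the class, its constructor arguments, and ``bw='normal_reference'`` are assumptions about that API, not part of the snippet):

# Usage sketch, assuming the snippet is KDEMultivariateConditional.pdf from
# statsmodels.nonparametric; all names below come from that assumed API.
import numpy as np
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional

rng = np.random.default_rng(0)
x = rng.normal(size=500)
y = 2.0 * x + rng.normal(size=500)            # endog depends on exog

kde = KDEMultivariateConditional(endog=[y], exog=[x],
                                 dep_type='c', indep_type='c',
                                 bw='normal_reference')
# f(y|x=0) on a small grid of y values
dens = kde.pdf(endog_predict=np.linspace(-3, 3, 7), exog_predict=np.zeros(7))
print(dens)
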
Example No. 2
    def pdf(self, endog_predict=None, exog_predict=None):
        r"""
        Evaluate the probability density function.

        Parameters
        ----------
        endog_predict: array_like, optional
            Evaluation data for the dependent variables.  If unspecified, the
            training data is used.
        exog_predict: array_like, optional
            Evaluation data for the independent variables.  If unspecified,
            the training data is used.

        Returns
        -------
        pdf: array_like
            The value of the probability density at `endog_predict` and `exog_predict`.

        Notes
        -----
        The formula for the conditional probability density is:

        .. math:: f(X|Y)=\frac{f(X,Y)}{f(Y)}

        with the marginal density estimated by the generalized product kernel
        estimator

        .. math:: f(Y)=n^{-1}\sum_{j=1}^{n}\prod_{s=1}^{q}h_{s}^{-1}k
                            \left(\frac{Y_{is}-Y_{js}}{h_{s}}\right)

        where :math:`k` is the appropriate kernel for each variable.
        """
        if endog_predict is None:
            endog_predict = self.endog
        else:
            endog_predict = _adjust_shape(endog_predict, self.k_dep)
        if exog_predict is None:
            exog_predict = self.exog
        else:
            exog_predict = _adjust_shape(exog_predict, self.k_indep)

        pdf_est = []
        data_predict = np.column_stack((endog_predict, exog_predict))
        for i in range(np.shape(data_predict)[0]):
            f_yx = gpke(self.bw, data=self.data,
                        data_predict=data_predict[i, :],
                        var_type=(self.dep_type + self.indep_type))
            f_x = gpke(self.bw[self.k_dep:], data=self.exog,
                       data_predict=exog_predict[i, :],
                       var_type=self.indep_type)
            pdf_est.append(f_yx / f_x)

        return np.squeeze(pdf_est)
Example No. 3
    def _est_loc_constant(self, bw, endog, exog, data_predict):
        """
        Local constant estimator of g(x) in the regression ``y = g(x) + e``.

        Parameters
        ----------
        bw : array_like
            Array of bandwidth value(s).
        endog : 1D array_like
            The dependent variable.
        exog : 1D or 2D array_like
            The independent variable(s).
        data_predict : 1D or 2D array_like
            The point(s) at which the conditional mean is estimated.

        Returns
        -------
        G : ndarray
            The value of the conditional mean at `data_predict`.
        B_x : ndarray
            The marginal effects.

        """
        ker_x = gpke(bw, data=exog, data_predict=data_predict,
                     var_type=self.var_type,
                     #ukertype='aitchison_aitken_reg',
                     #okertype='wangryzin_reg',
                     tosum=False)
        ker_x = np.reshape(ker_x, np.shape(endog))
        G_numer = (ker_x * endog).sum(axis=0)
        G_denom = ker_x.sum(axis=0)
        G = G_numer / G_denom
        nobs = exog.shape[0]
        f_x = G_denom / float(nobs)
        ker_xc = gpke(bw, data=exog, data_predict=data_predict,
                      var_type=self.var_type,
                      ckertype='d_gaussian',
                      #okertype='wangryzin_reg',
                      tosum=False)

        ker_xc = ker_xc[:, np.newaxis]
        d_mx = -(endog * ker_xc).sum(axis=0) / float(nobs) #* np.prod(bw[:, ix_cont]))
        d_fx = -ker_xc.sum(axis=0) / float(nobs) #* np.prod(bw[:, ix_cont]))
        # The quotient-rule form is kept for reference but is immediately
        # superseded by the assignment that follows it.
        #B_x = d_mx / f_x - G * d_fx / f_x
        B_x = (G_numer * d_fx - G_denom * d_mx) / (G_denom**2)
        #B_x = (f_x * d_mx - m_x * d_fx) / (f_x ** 2)
        return G, B_x
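
The method above is the local constant (Nadaraya-Watson) estimator g(x) = sum_i K_i y_i / sum_i K_i. A self-contained numpy sketch of that estimator with a Gaussian kernel (independent of the statsmodels internals) is:

# Nadaraya-Watson (local constant) sketch with a Gaussian kernel; this mirrors
# G = (ker_x * endog).sum() / ker_x.sum() above, not the statsmodels internals.
import numpy as np

def local_constant(x_eval, x, y, h):
    """g(x_eval) = sum_i K((x_eval - x_i)/h) * y_i / sum_i K((x_eval - x_i)/h)."""
    u = (x_eval[:, None] - x[None, :]) / h          # shape (n_eval, n_obs)
    K = np.exp(-0.5 * u**2) / np.sqrt(2 * np.pi)    # Gaussian kernel weights
    return (K * y).sum(axis=1) / K.sum(axis=1)

rng = np.random.default_rng(1)
x = rng.uniform(-2, 2, 300)
y = np.sin(x) + 0.3 * rng.normal(size=300)
print(local_constant(np.linspace(-2, 2, 5), x, y, h=0.3))
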
Example No. 4
    def pdf(self, data_predict=None):
        r"""
        Evaluate the probability density function.

        Parameters
        ----------
        data_predict: array_like, optional
            Points to evaluate at.  If unspecified, the training data is used.

        Returns
        -------
        pdf_est: array_like
            Probability density function evaluated at `data_predict`.

        Notes
        -----
        The probability density is the sample average of the generalized
        product kernel:

        .. math:: \hat{f}(X_{i}) = n^{-1}\sum_{j=1}^{n}K_{h}(X_{i},X_{j}),
            \qquad K_{h}(X_{i},X_{j}) =
            \prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
        """
        if data_predict is None:
            data_predict = self.data
        else:
            data_predict = _adjust_shape(data_predict, self.k_vars)

        pdf_est = []
        for i in range(np.shape(data_predict)[0]):
            pdf_est.append(gpke(self.bw, data=self.data,
                                data_predict=data_predict[i, :],
                                var_type=self.var_type) / self.nobs)

        pdf_est = np.squeeze(pdf_est)
        return pdf_est
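
As a sanity check on the formula in the Notes, a minimal numpy sketch of the product-kernel density estimator for purely continuous data (Gaussian kernels only; the helper name is hypothetical) might be:

# Sketch of the generalized product kernel density for continuous variables:
# f_hat(x) = n^{-1} sum_j prod_s h_s^{-1} k((x_s - X_js) / h_s).
import numpy as np

def product_kernel_pdf(data_predict, data, bw):
    """data: (n, q), data_predict: (m, q), bw: (q,) -> (m,) density estimates."""
    u = (data_predict[:, None, :] - data[None, :, :]) / bw    # (m, n, q)
    k = np.exp(-0.5 * u**2) / (np.sqrt(2 * np.pi) * bw)       # per-variable kernel
    return k.prod(axis=2).mean(axis=1)                        # product over vars, mean over obs

rng = np.random.default_rng(2)
sample = rng.normal(size=(500, 2))
points = np.array([[0.0, 0.0], [1.0, 1.0]])
print(product_kernel_pdf(points, sample, bw=np.array([0.3, 0.3])))
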
Example No. 5
    def loo_likelihood(self, bw, func=lambda x: x):
        r"""
        Returns the negative of the leave-one-out likelihood function.

        This is the leave-one-out likelihood for the unconditional KDE,
        negated so that it can be minimized directly during bandwidth
        selection.

        Parameters
        ----------
        bw: array_like
            The value for the bandwidth parameter(s).
        func: callable, optional
            Function to transform the likelihood values (before summing); for
            the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

        Notes
        -----
        The leave-one-out kernel estimator of :math:`f_{-i}` is:

        .. math:: f_{-i}(X_{i})=\frac{1}{n-1}
                    \sum_{j=1,j\neq i}K_{h}(X_{i},X_{j})

        where :math:`K_{h}` represents the generalized product kernel
        estimator:

        .. math:: K_{h}(X_{i},X_{j}) =
            \prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
        """
        LOO = LeaveOneOut(self.data)
        L = 0
        for i, X_not_i in enumerate(LOO):
            f_i = gpke(bw, data=-X_not_i, data_predict=-self.data[i, :],
                       var_type=self.var_type)
            L += func(f_i)

        return -L
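
For intuition, here is a small numpy sketch of the leave-one-out log-likelihood for a one-dimensional Gaussian KDE; maximizing it over h (i.e. minimizing its negative, as the method above does) is the likelihood cross-validation idea:

# Leave-one-out log-likelihood sketch for a 1D Gaussian KDE; the bandwidth
# minimizing this negative log-likelihood is the likelihood-CV choice.
import numpy as np

def neg_loo_loglik(h, x):
    n = x.size
    d = x[:, None] - x[None, :]
    K = np.exp(-0.5 * (d / h)**2) / (np.sqrt(2 * np.pi) * h)
    np.fill_diagonal(K, 0.0)                     # drop observation i itself
    f_loo = K.sum(axis=1) / (n - 1)              # f_{-i}(X_i)
    return -np.log(f_loo).sum()

rng = np.random.default_rng(3)
x = rng.normal(size=200)
print(min(((h, neg_loo_loglik(h, x)) for h in (0.1, 0.2, 0.4, 0.8)),
          key=lambda t: t[1]))
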
Example No. 6
    def pdf(self, data_predict=None):
        r"""
        Evaluate the probability density function.

        Parameters
        ----------
        data_predict: array_like, optional
            Points to evaluate at.  If unspecified, the training data is used.

        Returns
        -------
        pdf_est: array_like
            Probability density function evaluated at `data_predict`.

        Notes
        -----
        The probability density is the sample average of the generalized
        product kernel:

        .. math:: \hat{f}(X_{i}) = n^{-1}\sum_{j=1}^{n}K_{h}(X_{i},X_{j}),
            \qquad K_{h}(X_{i},X_{j}) =
            \prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
        """
        if data_predict is None:
            data_predict = self.data
        else:
            data_predict = _adjust_shape(data_predict, self.k_vars)

        pdf_est = []
        for i in range(np.shape(data_predict)[0]):
            pdf_est.append(gpke(self.bw, data=self.data,
                                data_predict=data_predict[i, :],
                                var_type=self.var_type) / self.nobs)

        pdf_est = np.squeeze(pdf_est)
        return pdf_est
Example No. 7
    def loo_likelihood(self, bw, func=lambda x: x):
        r"""
        Returns the negative of the leave-one-out likelihood function.

        This is the leave-one-out likelihood for the unconditional KDE,
        negated so that it can be minimized directly during bandwidth
        selection.

        Parameters
        ----------
        bw: array_like
            The value for the bandwidth parameter(s).
        func: callable, optional
            Function to transform the likelihood values (before summing); for
            the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

        Notes
        -----
        The leave-one-out kernel estimator of :math:`f_{-i}` is:

        .. math:: f_{-i}(X_{i})=\frac{1}{n-1}
                    \sum_{j=1,j\neq i}K_{h}(X_{i},X_{j})

        where :math:`K_{h}` represents the generalized product kernel
        estimator:

        .. math:: K_{h}(X_{i},X_{j}) =
            \prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
        """
        LOO = LeaveOneOut(self.data)
        L = 0
        for i, X_not_i in enumerate(LOO):
            f_i = gpke(bw, data=-X_not_i, data_predict=-self.data[i, :],
                       var_type=self.var_type)
            L += func(f_i)

        return -L
Example No. 8
    def _est_loc_linear(self, bw, endog, exog, data_predict):
        """
        Local linear estimator of g(x) in the regression ``y = g(x) + e``.

        Parameters
        ----------
        bw: array_like
            Vector of bandwidth value(s).
        endog: 1D array_like
            The dependent variable.
        exog: 1D or 2D array_like
            The independent variable(s).
        data_predict: 1D array_like of length K, where K is the number of variables
            The point at which the conditional mean is estimated.

        Returns
        -------
        mean: array_like
            The value of the conditional mean at `data_predict`.
        mfx: array_like
            The marginal effects at `data_predict`.

        Notes
        -----
        See p. 81 in [1] and p.38 in [2] for the formulas.
        Unlike other methods, this one requires that `data_predict` be 1D.

        """
        nobs, k_vars = exog.shape
        ker = gpke(bw, data=exog, data_predict=data_predict,
                   var_type=self.var_type,
                   #ukertype='aitchison_aitken_reg',
                   #okertype='wangryzin_reg',
                   tosum=False) / float(nobs)
        # Create the matrix on p.492 in [7], after the multiplication w/ K_h,ij
        # See also p. 38 in [2]
        #ix_cont = np.arange(self.k_vars)  # Use all vars instead of continuous only
        # Note: because ix_cont was defined here such that it selected all
        # columns, I removed the indexing with it from exog/data_predict.

        # Convert ker to a 2-D array to make matrix operations below work
        ker = ker[:, np.newaxis]

        M12 = exog - data_predict
        M22 = np.dot(M12.T, M12 * ker)
        M12 = (M12 * ker).sum(axis=0)
        M = np.empty((k_vars + 1, k_vars + 1))
        M[0, 0] = ker.sum()
        M[0, 1:] = M12
        M[1:, 0] = M12
        M[1:, 1:] = M22

        ker_endog = ker * endog
        V = np.empty((k_vars + 1, 1))
        V[0, 0] = ker_endog.sum()
        V[1:, 0] = ((exog - data_predict) * ker_endog).sum(axis=0)

        mean_mfx = np.dot(np.linalg.pinv(M), V)
        mean = mean_mfx[0]
        mfx = mean_mfx[1:, :]
        return mean, mfx
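
Equivalently, the local linear estimator solves a kernel-weighted least squares problem in (1, X - x); a compact numpy sketch (with an unnormalized product Gaussian kernel, independent of the statsmodels code) is:

# Local linear sketch: minimize sum_i K_i * (y_i - a - b'(X_i - x))^2, the same
# normal equations as the M, V system assembled above.
import numpy as np

def local_linear(x_eval, X, y, bw):
    """X: (n, k), y: (n,), x_eval: (k,), bw: (k,) -> (mean, marginal effects)."""
    u = (X - x_eval) / bw
    K = np.exp(-0.5 * (u**2).sum(axis=1))          # product Gaussian weights
    Z = np.column_stack([np.ones(len(X)), X - x_eval])
    beta = np.linalg.pinv(Z.T @ (K[:, None] * Z)) @ (Z.T @ (K * y))
    return beta[0], beta[1:]                       # a = g(x), b = dg/dx

rng = np.random.default_rng(4)
X = rng.uniform(-1, 1, size=(400, 2))
y = 1.0 + 2.0 * X[:, 0] - X[:, 1] + 0.1 * rng.normal(size=400)
print(local_linear(np.zeros(2), X, y, bw=np.array([0.3, 0.3])))
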
Example No. 9
    def _est_loc_linear(self, bw, endog, exog, data_predict, W):
        """
        Local linear estimator of g(x) in the regression ``y = g(x) + e``.

        Parameters
        ----------
        bw: array_like
            Vector of bandwidth value(s).
        endog: 1D array_like
            The dependent variable.
        exog: 1D or 2D array_like
            The independent variable(s).
        data_predict: 1D array_like of length K, where K is the number of variables
            The point at which the conditional mean is estimated.
        W: 2D array_like
            Observation weights that multiply the kernel weights
            (``ker = W * ker[:, np.newaxis]`` below).

        Returns
        -------
        mean: array_like
            The value of the conditional mean at `data_predict`.
        mfx: array_like
            The marginal effects at `data_predict`.

        Notes
        -----
        See p. 81 in [1] and p. 38 in [2] for the formulas.
        Unlike other methods, this one requires that `data_predict` be 1D.

        """
        nobs, k_vars = exog.shape
        ker = gpke(bw, data=exog, data_predict=data_predict,
                   var_type=self.var_type,
                   ukertype='aitchison_aitken_reg',
                   okertype='wangryzin_reg', tosum=False)
        # Create the matrix on p.492 in [7], after the multiplication w/ K_h,ij
        # See also p. 38 in [2]

        # Convert ker to a 2-D array to make matrix operations below work
        ker = W * ker[:, np.newaxis]

        M12 = exog - data_predict
        M22 = np.dot(M12.T, M12 * ker)
        M12 = (M12 * ker).sum(axis=0)
        M = np.empty((k_vars + 1, k_vars + 1))
        M[0, 0] = ker.sum()
        M[0, 1:] = M12
        M[1:, 0] = M12
        M[1:, 1:] = M22

        ker_endog = ker * endog
        V = np.empty((k_vars + 1, 1))
        V[0, 0] = ker_endog.sum()
        V[1:, 0] = ((exog - data_predict) * ker_endog).sum(axis=0)

        mean_mfx = np.dot(np.linalg.pinv(M), V)
        mean = mean_mfx[0]
        mfx = mean_mfx[1:, :]
        return mean, mfx
Example No. 10
    def loo_likelihood(self, bw, func=lambda x: x):
        """
        Returns the leave-one-out conditional likelihood of the data.

        If `func` is not the default identity, it is applied to each
        leave-one-out likelihood term before summing.

        Parameters
        ----------
        bw: array_like
            The bandwidth parameter(s).
        func: callable, optional
            Function to transform the likelihood values (before summing); for
            the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

        Returns
        -------
        L: float
            The negative of the leave-one-out conditional likelihood (or its
            transformation by `func`) summed over the data.

        Notes
        -----
        Similar to ``KDE.loo_likelihood``, but substituting
        ``f(y|x)=f(x,y)/f(x)`` for ``f(x)``.
        """
        yLOO = LeaveOneOut(self.data)
        xLOO = iter(LeaveOneOut(self.exog))
        L = 0
        for i, Y_j in enumerate(yLOO):
            X_not_i = next(xLOO)
            f_yx = gpke(bw,
                        data=-Y_j,
                        data_predict=-self.data[i, :],
                        var_type=(self.dep_type + self.indep_type))
            f_x = gpke(bw[self.k_dep:],
                       data=-X_not_i,
                       data_predict=-self.exog[i, :],
                       var_type=self.indep_type)
            f_i = f_yx / f_x
            L += func(f_i)

        return -L
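
In practice an objective like this is handed to a numerical optimizer to pick the bandwidths. A hedged sketch with a toy stand-in objective (``loo_objective`` and the starting values are illustrative, not statsmodels internals):

# Sketch: choose bandwidths by minimizing a leave-one-out objective of the
# kind returned above.  loo_objective is a toy stand-in for something like
# model.loo_likelihood(bw, func=np.log).
import numpy as np
from scipy.optimize import minimize

def loo_objective(bw, x, y):
    hy, hx = np.abs(bw)                           # keep bandwidths positive
    Ky = np.exp(-0.5 * ((y[:, None] - y[None, :]) / hy)**2) / (np.sqrt(2 * np.pi) * hy)
    Kx = np.exp(-0.5 * ((x[:, None] - x[None, :]) / hx)**2) / (np.sqrt(2 * np.pi) * hx)
    np.fill_diagonal(Ky, 0.0)
    np.fill_diagonal(Kx, 0.0)
    f_yx = (Ky * Kx).sum(axis=1)                  # leave-one-out joint density (up to 1/(n-1))
    f_x = Kx.sum(axis=1)                          # leave-one-out marginal density
    return -np.log(f_yx / f_x).sum()

rng = np.random.default_rng(5)
x = rng.normal(size=150)
y = x + 0.5 * rng.normal(size=150)
res = minimize(loo_objective, x0=[0.5, 0.5], args=(x, y), method='Nelder-Mead')
print(res.x)
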
Example No. 11
    def loo_likelihood(self, bw, func=lambda x: x):
        """
        Returns the leave-one-out conditional likelihood of the data.

        If `func` is not the default identity, it is applied to each
        leave-one-out likelihood term before summing.

        Parameters
        ----------
        bw: array_like
            The bandwidth parameter(s).
        func: callable, optional
            Function to transform the likelihood values (before summing); for
            the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

        Returns
        -------
        L: float
            The negative of the leave-one-out conditional likelihood (or its
            transformation by `func`) summed over the data.

        Notes
        -----
        Similar to ``KDE.loo_likelihood``, but substituting
        ``f(y|x)=f(x,y)/f(x)`` for ``f(x)``.
        """
        yLOO = LeaveOneOut(self.data)
        xLOO = iter(LeaveOneOut(self.exog))
        L = 0
        for i, Y_j in enumerate(yLOO):
            X_not_i = next(xLOO)
            f_yx = gpke(bw, data=-Y_j, data_predict=-self.data[i, :],
                        var_type=(self.dep_type + self.indep_type))
            f_x = gpke(bw[self.k_dep:], data=-X_not_i,
                       data_predict=-self.exog[i, :],
                       var_type=self.indep_type)
            f_i = f_yx / f_x
            L += func(f_i)

        return -L
Example No. 12
    def cdf(self, data_predict=None):
        r"""
        Evaluate the cumulative distribution function.

        Parameters
        ----------
        data_predict: array_like, optional
            Points to evaluate at.  If unspecified, the training data is used.

        Returns
        -------
        cdf_est: array_like
            The estimate of the cdf.

        Notes
        -----
        See http://en.wikipedia.org/wiki/Cumulative_distribution_function.
        For more details on the estimation see Ref. [5] in the module
        docstring.

        The multivariate CDF for mixed data (continuous and ordered/unordered
        discrete) is estimated by:

        .. math:: F(x^{c},x^{d})=n^{-1}\sum_{i=1}^{n}\left[G\left(
            \frac{x^{c}-X_{i}^{c}}{h}\right)\sum_{u\leq x^{d}}L(X_{i}^{d},u,
            \lambda)\right]

        where G() is the product kernel CDF estimator for the continuous
        variables and L() for the discrete variables.

        The bandwidth used is ``self.bw``.
        """
        if data_predict is None:
            data_predict = self.data
        else:
            data_predict = _adjust_shape(data_predict, self.k_vars)

        cdf_est = []
        for i in range(np.shape(data_predict)[0]):
            cdf_est.append(
                gpke(self.bw,
                     data=self.data,
                     data_predict=data_predict[i, :],
                     var_type=self.var_type,
                     ckertype="gaussian_cdf",
                     ukertype="aitchisonaitken_cdf",
                     okertype='wangryzin_cdf') / self.nobs)

        cdf_est = np.squeeze(cdf_est)
        return cdf_est
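
The continuous part of this estimator replaces the usual pdf kernel by its integral. A small numpy/scipy sketch for a single continuous variable (the function name is illustrative):

# Kernel CDF sketch for one continuous variable:
# F_hat(x) = n^{-1} sum_i Phi((x - X_i) / h), the Gaussian-CDF kernel.
import numpy as np
from scipy.stats import norm

def kernel_cdf(x_eval, x, h):
    return norm.cdf((x_eval[:, None] - x[None, :]) / h).mean(axis=1)

rng = np.random.default_rng(6)
x = rng.normal(size=1000)
print(kernel_cdf(np.array([-1.0, 0.0, 1.0]), x, h=0.3))   # ~ Phi(-1), Phi(0), Phi(1)
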
Example No. 13
    def cdf(self, data_predict=None):
        r"""
        Evaluate the cumulative distribution function.

        Parameters
        ----------
        data_predict: array_like, optional
            Points to evaluate at.  If unspecified, the training data is used.

        Returns
        -------
        cdf_est: array_like
            The estimate of the cdf.

        Notes
        -----
        See http://en.wikipedia.org/wiki/Cumulative_distribution_function.
        For more details on the estimation see Ref. [5] in the module
        docstring.

        The multivariate CDF for mixed data (continuous and ordered/unordered
        discrete) is estimated by:

        .. math:: F(x^{c},x^{d})=n^{-1}\sum_{i=1}^{n}\left[G\left(
            \frac{x^{c}-X_{i}^{c}}{h}\right)\sum_{u\leq x^{d}}L(X_{i}^{d},u,
            \lambda)\right]

        where G() is the product kernel CDF estimator for the continuous
        variables and L() for the discrete variables.

        The bandwidth used is ``self.bw``.
        """
        if data_predict is None:
            data_predict = self.data
        else:
            data_predict = _adjust_shape(data_predict, self.k_vars)

        cdf_est = []
        for i in range(np.shape(data_predict)[0]):
            cdf_est.append(gpke(self.bw, data=self.data,
                                data_predict=data_predict[i, :],
                                var_type=self.var_type,
                                ckertype="gaussian_cdf",
                                ukertype="aitchisonaitken_cdf",
                                okertype='wangryzin_cdf') / self.nobs)

        cdf_est = np.squeeze(cdf_est)
        return cdf_est
Example No. 14
    def aic_hurvich(self, bw, func=None):
        """
        Computes the Hurvich AIC criterion for bandwidth estimation.

        Parameters
        ----------
        bw : str or array_like
            See the ``bw`` parameter of `KernelReg` for details.
        func : None, optional
            Unused here; kept in the signature because `cv_loo` uses it.

        Returns
        -------
        aic : ndarray
            The Hurvich AIC criterion, one element for each variable.

        References
        ----------
        See ch.2 in [1] and p.35 in [2].

        """
        H = np.empty((self.nobs, self.nobs))
        for j in range(self.nobs):
            H[:, j] = gpke(bw, data=self.exog, data_predict=self.exog[j,:],
                           var_type=self.var_type, tosum=False)

        denom = H.sum(axis=1)
        H = H / denom
        gx = KernelReg(endog=self.endog, exog=self.exog, var_type=self.var_type,
                       reg_type=self.reg_type, bw=bw,
                       defaults=EstimatorSettings(efficient=False)).fit()[0]
        gx = np.reshape(gx, (self.nobs, 1))
        sigma = ((self.endog - gx)**2).sum(axis=0) / float(self.nobs)

        frac = (1 + np.trace(H) / float(self.nobs)) / \
               (1 - (np.trace(H) + 2) / float(self.nobs))
        #siga = np.dot(self.endog.T, (I - H).T)
        #sigb = np.dot((I - H), self.endog)
        #sigma = np.dot(siga, sigb) / float(self.nobs)
        aic = np.log(sigma) + frac
        return aic
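
If this is the criterion behind bandwidth selection in statsmodels' ``KernelReg``, a usage sketch might look like the following; the ``bw='aic'`` option name and the rest of the constructor signature are assumptions about that API:

# Usage sketch, assuming statsmodels' KernelReg exposes this criterion through
# bw='aic' (an assumption about the API, not something shown in the snippet).
import numpy as np
from statsmodels.nonparametric.kernel_regression import KernelReg

rng = np.random.default_rng(7)
x = rng.uniform(0, 3, 100)
y = np.sin(x) + 0.2 * rng.normal(size=100)

model = KernelReg(endog=y, exog=x, var_type='c', reg_type='lc', bw='aic')
mean, mfx = model.fit(np.linspace(0.5, 2.5, 5))
print(model.bw, mean)
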
Example No. 15
    def imse(self, bw):
        r"""
        The integrated mean square error for the conditional KDE.

        Parameters
        ----------
        bw: array_like
            The bandwidth parameter(s).

        Returns
        -------
        CV: float
            The cross-validation objective function.

        Notes
        -----
        For more details see pp. 156-166 in [1].
        For details on how to handle the mixed variable types see [3].

        The formula for the cross-validation objective function for mixed
        variable types is:

        .. math:: CV(h,\lambda)=\frac{1}{n}\sum_{l=1}^{n}
            \frac{G_{-l}(X_{l})}{\left[\mu_{-l}(X_{l})\right]^{2}}-
            \frac{2}{n}\sum_{l=1}^{n}\frac{f_{-l}(X_{l},Y_{l})}{\mu_{-l}(X_{l})}

        where

        .. math:: G_{-l}(X_{l}) = n^{-2}\sum_{i\neq l}\sum_{j\neq l}
                        K_{X_{i},X_{l}} K_{X_{j},X_{l}}K_{Y_{i},Y_{j}}^{(2)}

        where :math:`K_{X_{i},X_{l}}` is the multivariate product kernel and
        :math:`\mu_{-l}(X_{l})` is the leave-one-out estimator of the pdf.

        :math:`K_{Y_{i},Y_{j}}^{(2)}` is the convolution kernel.

        The value of the function is minimized by the ``_cv_ls`` method of the
        `GenericKDE` class to return the bw estimates that minimize the
        distance between the estimated and "true" probability density.
        """
        zLOO = LeaveOneOut(self.data)
        CV = 0
        nobs = float(self.nobs)
        expander = np.ones((self.nobs - 1, 1))
        for ii, Z in enumerate(zLOO):
            X = Z[:, self.k_dep:]
            Y = Z[:, :self.k_dep]
            Ye_L = np.kron(Y, expander)
            Ye_R = np.kron(expander, Y)
            Xe_L = np.kron(X, expander)
            Xe_R = np.kron(expander, X)
            K_Xi_Xl = gpke(bw[self.k_dep:], data=Xe_L,
                           data_predict=self.exog[ii, :],
                           var_type=self.indep_type, tosum=False)
            K_Xj_Xl = gpke(bw[self.k_dep:], data=Xe_R,
                           data_predict=self.exog[ii, :],
                           var_type=self.indep_type, tosum=False)
            K2_Yi_Yj = gpke(bw[0:self.k_dep], data=Ye_L,
                            data_predict=Ye_R, var_type=self.dep_type,
                            ckertype='gauss_convolution',
                            okertype='wangryzin_convolution',
                            ukertype='aitchisonaitken_convolution',
                            tosum=False)
            G = (K_Xi_Xl * K_Xj_Xl * K2_Yi_Yj).sum() / nobs**2
            f_X_Y = gpke(bw, data=-Z, data_predict=-self.data[ii, :],
                         var_type=(self.dep_type + self.indep_type)) / nobs
            m_x = gpke(bw[self.k_dep:], data=-X,
                       data_predict=-self.exog[ii, :],
                       var_type=self.indep_type) / nobs
            CV += (G / m_x ** 2) - 2 * (f_X_Y / m_x)

        return CV / nobs
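
A hedged usage sketch of how this objective is typically reached, assuming it sits behind ``bw='cv_ls'`` in statsmodels' ``KDEMultivariateConditional`` (the option name and constructor arguments are assumptions):

# Usage sketch: least-squares cross-validation bandwidths for a conditional
# KDE, assuming bw='cv_ls' minimizes an imse-style objective like the above.
import numpy as np
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional

rng = np.random.default_rng(8)
x = rng.normal(size=80)                      # small sample keeps the O(n^3) CV quick
y = x**2 + 0.5 * rng.normal(size=80)

kde = KDEMultivariateConditional(endog=[y], exog=[x],
                                 dep_type='c', indep_type='c', bw='cv_ls')
print(kde.bw)                                # [h_y, h_x] chosen by cross-validation
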
Example No. 16
    def cdf(self, endog_predict=None, exog_predict=None):
        r"""
        Cumulative distribution function for the conditional density.

        Parameters
        ----------
        endog_predict: array_like, optional
            The evaluation dependent variables at which the cdf is estimated.
            If not specified the training dependent variables are used.
        exog_predict: array_like, optional
            The evaluation independent variables at which the cdf is estimated.
            If not specified the training independent variables are used.

        Returns
        -------
        cdf_est: array_like
            The estimate of the cdf.

        Notes
        -----
        For more details on the estimation see [5], and p.181 in [1].

        The multivariate conditional CDF for mixed data (continuous and
        ordered/unordered discrete) is estimated by:

        .. math:: F(y|x)=\frac{n^{-1}\sum_{i=1}^{n}G(\frac{y-Y_{i}}{h_{0}})
                              W_{h}(X_{i},x)}{\widehat{\mu}(x)}

        where G() is the product kernel CDF estimator for the dependent (y)
        variable(s) and W() is the product kernel (density) estimator for the
        independent variable(s).
        """
        if endog_predict is None:
            endog_predict = self.endog
        else:
            endog_predict = _adjust_shape(endog_predict, self.k_dep)
        if exog_predict is None:
            exog_predict = self.exog
        else:
            exog_predict = _adjust_shape(exog_predict, self.k_indep)

        N_data_predict = np.shape(exog_predict)[0]
        cdf_est = np.empty(N_data_predict)
        for i in range(N_data_predict):
            mu_x = gpke(self.bw[self.k_dep:], data=self.exog,
                        data_predict=exog_predict[i, :],
                        var_type=self.indep_type) / self.nobs
            mu_x = np.squeeze(mu_x)
            cdf_endog = gpke(self.bw[0:self.k_dep], data=self.endog,
                             data_predict=endog_predict[i, :],
                             var_type=self.dep_type,
                             ckertype="gaussian_cdf",
                             ukertype="aitchisonaitken_cdf",
                             okertype='wangryzin_cdf', tosum=False)

            cdf_exog = gpke(self.bw[self.k_dep:], data=self.exog,
                            data_predict=exog_predict[i, :],
                            var_type=self.indep_type, tosum=False)
            S = (cdf_endog * cdf_exog).sum(axis=0)
            cdf_est[i] = S / (self.nobs * mu_x)

        return cdf_est
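
A self-contained numpy/scipy sketch of the same conditional CDF estimator for one continuous y and one continuous x (Gaussian kernels; the helper name is illustrative):

# Conditional CDF sketch matching the docstring formula:
# F(y|x) = [n^{-1} sum_i Phi((y - Y_i)/h0) * K((x - X_i)/h1)] / mu_hat(x).
import numpy as np
from scipy.stats import norm

def cond_cdf(y_eval, x_eval, y, x, h0, h1):
    w = norm.pdf((x_eval - x) / h1) / h1          # kernel weights in x; mean(w) = mu_hat(x)
    G = norm.cdf((y_eval - y) / h0)               # Gaussian CDF kernel in y
    return (G * w).sum() / w.sum()

rng = np.random.default_rng(9)
x = rng.normal(size=2000)
y = x + 0.5 * rng.normal(size=2000)
print(cond_cdf(0.0, 0.0, y, x, h0=0.2, h1=0.2))   # P(Y <= 0 | X = 0), close to 0.5 here
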
Example No. 17
    def cdf(self, endog_predict=None, exog_predict=None):
        r"""
        Cumulative distribution function for the conditional density.

        Parameters
        ----------
        endog_predict: array_like, optional
            The evaluation dependent variables at which the cdf is estimated.
            If not specified the training dependent variables are used.
        exog_predict: array_like, optional
            The evaluation independent variables at which the cdf is estimated.
            If not specified the training independent variables are used.

        Returns
        -------
        cdf_est: array_like
            The estimate of the cdf.

        Notes
        -----
        For more details on the estimation see [5], and p.181 in [1].

        The multivariate conditional CDF for mixed data (continuous and
        ordered/unordered discrete) is estimated by:

        .. math:: F(y|x)=\frac{n^{-1}\sum_{i=1}^{n}G(\frac{y-Y_{i}}{h_{0}})
                              W_{h}(X_{i},x)}{\widehat{\mu}(x)}

        where G() is the product kernel CDF estimator for the dependent (y)
        variable(s) and W() is the product kernel (density) estimator for the
        independent variable(s).
        """
        if endog_predict is None:
            endog_predict = self.endog
        else:
            endog_predict = _adjust_shape(endog_predict, self.k_dep)
        if exog_predict is None:
            exog_predict = self.exog
        else:
            exog_predict = _adjust_shape(exog_predict, self.k_indep)

        N_data_predict = np.shape(exog_predict)[0]
        cdf_est = np.empty(N_data_predict)
        for i in range(N_data_predict):
            mu_x = gpke(self.bw[self.k_dep:], data=self.exog,
                        data_predict=exog_predict[i, :],
                        var_type=self.indep_type) / self.nobs
            mu_x = np.squeeze(mu_x)
            cdf_endog = gpke(self.bw[0:self.k_dep], data=self.endog,
                             data_predict=endog_predict[i, :],
                             var_type=self.dep_type,
                             ckertype="gaussian_cdf",
                             ukertype="aitchisonaitken_cdf",
                             okertype='wangryzin_cdf', tosum=False)

            cdf_exog = gpke(self.bw[self.k_dep:], data=self.exog,
                            data_predict=exog_predict[i, :],
                            var_type=self.indep_type, tosum=False)
            S = (cdf_endog * cdf_exog).sum(axis=0)
            cdf_est[i] = S / (self.nobs * mu_x)

        return cdf_est
Example No. 18
    def imse(self, bw):
        r"""
        The integrated mean square error for the conditional KDE.

        Parameters
        ----------
        bw: array_like
            The bandwidth parameter(s).

        Returns
        -------
        CV: float
            The cross-validation objective function.

        Notes
        -----
        For more details see pp. 156-166 in [1].
        For details on how to handle the mixed variable types see [3].

        The formula for the cross-validation objective function for mixed
        variable types is:

        .. math:: CV(h,\lambda)=\frac{1}{n}\sum_{l=1}^{n}
            \frac{G_{-l}(X_{l})}{\left[\mu_{-l}(X_{l})\right]^{2}}-
            \frac{2}{n}\sum_{l=1}^{n}\frac{f_{-l}(X_{l},Y_{l})}{\mu_{-l}(X_{l})}

        where

        .. math:: G_{-l}(X_{l}) = n^{-2}\sum_{i\neq l}\sum_{j\neq l}
                        K_{X_{i},X_{l}} K_{X_{j},X_{l}}K_{Y_{i},Y_{j}}^{(2)}

        where :math:`K_{X_{i},X_{l}}` is the multivariate product kernel and
        :math:`\mu_{-l}(X_{l})` is the leave-one-out estimator of the pdf.

        :math:`K_{Y_{i},Y_{j}}^{(2)}` is the convolution kernel.

        The value of the function is minimized by the ``_cv_ls`` method of the
        `GenericKDE` class to return the bw estimates that minimize the
        distance between the estimated and "true" probability density.
        """
        zLOO = LeaveOneOut(self.data)
        CV = 0
        nobs = float(self.nobs)
        expander = np.ones((self.nobs - 1, 1))
        for ii, Z in enumerate(zLOO):
            X = Z[:, self.k_dep:]
            Y = Z[:, :self.k_dep]
            Ye_L = np.kron(Y, expander)
            Ye_R = np.kron(expander, Y)
            Xe_L = np.kron(X, expander)
            Xe_R = np.kron(expander, X)
            K_Xi_Xl = gpke(bw[self.k_dep:], data=Xe_L,
                           data_predict=self.exog[ii, :],
                           var_type=self.indep_type, tosum=False)
            K_Xj_Xl = gpke(bw[self.k_dep:], data=Xe_R,
                           data_predict=self.exog[ii, :],
                           var_type=self.indep_type, tosum=False)
            K2_Yi_Yj = gpke(bw[0:self.k_dep], data=Ye_L,
                            data_predict=Ye_R, var_type=self.dep_type,
                            ckertype='gauss_convolution',
                            okertype='wangryzin_convolution',
                            ukertype='aitchisonaitken_convolution',
                            tosum=False)
            G = (K_Xi_Xl * K_Xj_Xl * K2_Yi_Yj).sum() / nobs**2
            f_X_Y = gpke(bw, data=-Z, data_predict=-self.data[ii, :],
                         var_type=(self.dep_type + self.indep_type)) / nobs
            m_x = gpke(bw[self.k_dep:], data=-X,
                       data_predict=-self.exog[ii, :],
                       var_type=self.indep_type) / nobs
            CV += (G / m_x ** 2) - 2 * (f_X_Y / m_x)

        return CV / nobs