def loo_likelihood(self, bw, func=lambda x: x):
    r"""
    Evaluate the negated leave-one-out likelihood of the unconditional KDE.

    Each leave-one-out subset of ``self.data`` is used to evaluate the
    density at the matching row of ``self.data``; the value is passed
    through `func`, the results are accumulated, and the negative of the
    accumulated total is returned.

    Parameters
    ----------
    bw : array_like
        The value for the bandwidth parameter(s).
    func : callable, optional
        Function to transform the likelihood values (before summing); for
        the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

    Returns
    -------
    L
        The negative of the sum of ``func(f_{-i}(X_i))`` over all
        observations.

    Notes
    -----
    The leave-one-out kernel estimator of :math:`f_{-i}` is:

    .. math:: f_{-i}(X_{i})=\frac{1}{(n-1)h}
                    \sum_{j=1,j\neq i}K_{h}(X_{i},X_{j})

    where :math:`K_{h}` represents the generalized product kernel
    estimator:

    .. math:: K_{h}(X_{i},X_{j}) =
        \prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
    """
    total = 0
    for row, others in enumerate(LeaveOneOut(self.data)):
        # Both the sample and the evaluation point are negated before being
        # handed to gpke.
        # NOTE(review): the reason for the negation is not visible here.
        point = -self.data[row, :]
        density = gpke(bw, data=-others, data_predict=point,
                       var_type=self.var_type)
        total += func(density)
    # The sign is flipped on return (presumably so a minimizer can be used;
    # confirm against the caller).
    return -total
def cv_loo(self, bw, func):
    r"""
    The cross-validation function with leave-one-out estimator.

    Parameters
    ----------
    bw : array_like
        Vector of bandwidth values.
    func : callable
        Returns the estimator of g(x).  Can be either ``_est_loc_constant``
        (local constant) or ``_est_loc_linear`` (local_linear).  It is
        called as ``func(bw, endog=..., exog=..., data_predict=..., W=...)``
        and only the first element of its result is used.

    Returns
    -------
    L : float
        The value of the CV function.

    Notes
    -----
    Calculates the cross-validation least-squares function.  This function
    is minimized by compute_bw to calculate the optimal value of `bw`.

    For details see p.35 in [2]

    .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2}

    where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X)
    and :math:`h` is the vector of bandwidths.
    """
    LOO_X = LeaveOneOut(self.exog)
    LOO_Y = iter(LeaveOneOut(self.endog))
    LOO_W = iter(LeaveOneOut(self.W_in))
    L = 0
    for ii, X_not_i in enumerate(LOO_X):
        # Use the builtin next(): iterators have no ``.next()`` method on
        # Python 3.  Advancing explicitly (rather than zip) keeps the
        # original failure mode if the three inputs differ in length.
        Y = next(LOO_Y)
        w = next(LOO_W)
        G = func(bw, endog=Y, exog=-X_not_i,
                 data_predict=-self.exog[ii, :], W=w)[0]
        L += (self.endog[ii] - G) ** 2

    # Note: There might be a way to vectorize this. See p.72 in [1]
    return L / self.nobs
def loo_likelihood(self, bw, func=lambda x: x):
    """
    Returns the negated leave-one-out conditional likelihood of the data.

    If `func` is not equal to the default, what's calculated is a function
    of the leave-one-out conditional likelihood.

    Parameters
    ----------
    bw : array_like
        The bandwidth parameter(s).  The entries from index ``self.k_dep``
        onwards are used for the density of ``self.exog``.
    func : callable, optional
        Function to transform the likelihood values (before summing); for
        the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

    Returns
    -------
    L : float
        The negative of the summed (transformed) leave-one-out conditional
        likelihood values.

    Notes
    -----
    Similar to ``KDE.loo_likelihood``, but substitute
    ``f(y|x)=f(x,y)/f(x)`` for ``f(x)``.
    """
    yLOO = LeaveOneOut(self.data)
    xLOO = iter(LeaveOneOut(self.exog))
    L = 0
    for i, Y_j in enumerate(yLOO):
        # Builtin next(): iterators have no ``.next()`` method on Python 3.
        X_not_i = next(xLOO)
        # Joint density estimate at row i of self.data.
        f_yx = gpke(bw, data=-Y_j, data_predict=-self.data[i, :],
                    var_type=(self.dep_type + self.indep_type))
        # Density estimate at row i of self.exog.
        f_x = gpke(bw[self.k_dep:], data=-X_not_i,
                   data_predict=-self.exog[i, :],
                   var_type=self.indep_type)
        f_i = f_yx / f_x
        L += func(f_i)

    return -L
def imse(self, bw):
    r"""
    Integrated mean square error objective for the conditional KDE.

    Parameters
    ----------
    bw : array_like
        The bandwidth parameter(s).  The first ``self.k_dep`` entries are
        used for the dependent variables, the remaining ones for the
        independent variables.

    Returns
    -------
    CV : float
        The cross-validation objective function.

    Notes
    -----
    For more details see pp. 156-166 in [1]. For details on how to handle
    the mixed variable types see [3].

    The formula for the cross-validation objective function for mixed
    variable types is:

    .. math:: CV(h,\lambda)=\frac{1}{n}\sum_{l=1}^{n}
        \frac{G_{-l}(X_{l})}{\left[\mu_{-l}(X_{l})\right]^{2}}-
        \frac{2}{n}\sum_{l=1}^{n}\frac{f_{-l}(X_{l},Y_{l})}{\mu_{-l}(X_{l})}

    where

    .. math:: G_{-l}(X_{l}) = n^{-2}\sum_{i\neq l}\sum_{j\neq l}
                    K_{X_{i},X_{l}} K_{X_{j},X_{l}}K_{Y_{i},Y_{j}}^{(2)}

    where :math:`K_{X_{i},X_{l}}` is the multivariate product kernel and
    :math:`\mu_{-l}(X_{l})` is the leave-one-out estimator of the pdf.

    :math:`K_{Y_{i},Y_{j}}^{(2)}` is the convolution kernel.

    The value of the function is minimized by the ``_cv_ls`` method of the
    `GenericKDE` class to return the bw estimates that minimize the
    distance between the estimated and "true" probability density.
    """
    n_float = float(self.nobs)
    ones_col = np.ones((self.nobs - 1, 1))

    # Loop-invariant pieces: bandwidth split and variable-type strings.
    k_dep = self.k_dep
    bw_dep = bw[0:k_dep]
    bw_indep = bw[k_dep:]
    indep_type = self.indep_type
    joint_type = self.dep_type + indep_type

    cv_total = 0
    for idx, rest in enumerate(LeaveOneOut(self.data)):
        x_rest = rest[:, k_dep:]
        y_rest = rest[:, :k_dep]
        x_point = self.exog[idx, :]

        # Kronecker products with a column of ones enumerate every ordered
        # (i, j) pair of rows: the "left" form repeats each row in place,
        # the "right" form tiles the whole block.
        y_left = np.kron(y_rest, ones_col)
        y_right = np.kron(ones_col, y_rest)
        x_left = np.kron(x_rest, ones_col)
        x_right = np.kron(ones_col, x_rest)

        kern_xi = gpke(bw_indep, data=x_left, data_predict=x_point,
                       var_type=indep_type, tosum=False)
        kern_xj = gpke(bw_indep, data=x_right, data_predict=x_point,
                       var_type=indep_type, tosum=False)
        kern2_y = gpke(bw_dep, data=y_left, data_predict=y_right,
                       var_type=self.dep_type,
                       ckertype='gauss_convolution',
                       okertype='wangryzin_convolution',
                       ukertype='aitchisonaitken_convolution',
                       tosum=False)
        g_term = (kern_xi * kern_xj * kern2_y).sum() / n_float**2

        # Leave-one-out joint and marginal estimates at observation idx.
        joint = gpke(bw, data=-rest, data_predict=-self.data[idx, :],
                     var_type=joint_type) / n_float
        marginal = gpke(bw_indep, data=-x_rest,
                        data_predict=-self.exog[idx, :],
                        var_type=indep_type) / n_float

        first = g_term / marginal ** 2
        second = 2 * (joint / marginal)
        cv_total += first - second

    return cv_total / n_float
def imse(self, bw):
    r"""
    Returns the Integrated Mean Square Error for the unconditional KDE.

    Parameters
    ----------
    bw : array_like
        The bandwidth parameter(s), one per column of ``self.data``.  Any
        sequence is accepted; it is converted with ``np.asarray``.

    Returns
    -------
    CV : float
        The cross-validation objective function.

    Notes
    -----
    See p. 27 in [1]
    For details on how to handle the multivariate
    estimation with mixed data types see p.6 in [3]

    The formula for the cross-validation objective function is:

    .. math:: CV=\frac{1}{n^{2}}\sum_{i=1}^{n}\sum_{j=1}^{n}
        \bar{K}_{h}(X_{i},X_{j})-\frac{2}{n(n-1)}\sum_{i=1}^{n}
        \sum_{j=1,j\neq i}^{n}K_{h}(X_{i},X_{j})

    Where :math:`\bar{K}_{h}` is the multivariate product convolution
    kernel (consult [3] for mixed data types).
    """
    # Reference implementation (kept for documentation only):
    #
    #F = 0
    #for i in range(self.nobs):
    #    k_bar_sum = gpke(bw, data=-self.data,
    #                     data_predict=-self.data[i, :],
    #                     var_type=self.var_type,
    #                     ckertype='gauss_convolution',
    #                     okertype='wangryzin_convolution',
    #                     ukertype='aitchisonaitken_convolution')
    #    F += k_bar_sum
    ## there is a + because loo_likelihood returns the negative
    #return (F / self.nobs**2 + self.loo_likelihood(bw) * \
    #        2 / ((self.nobs) * (self.nobs - 1)))

    # The code below is equivalent to the commented-out code above.  It's
    # about 20% faster due to some code being moved outside the for-loops
    # and shared by gpke() and loo_likelihood().

    # Boolean-mask indexing below needs an ndarray; a plain list or tuple
    # would raise, although `bw` is documented as array_like.
    bw = np.asarray(bw)

    F = 0
    kertypes = dict(c=kernels.gaussian_convolution,
                    o=kernels.wang_ryzin_convolution,
                    u=kernels.aitchison_aitken_convolution)
    nobs = self.nobs
    data = -self.data
    var_type = self.var_type
    # Only continuous ('c') variables contribute to the bandwidth product
    # that normalizes the product kernel.
    ix_cont = np.array([c == 'c' for c in var_type])
    _bw_cont_product = bw[ix_cont].prod()
    Kval = np.empty(data.shape)
    for i in range(nobs):
        for ii, vtype in enumerate(var_type):
            Kval[:, ii] = kertypes[vtype](bw[ii],
                                          data[:, ii],
                                          data[i, ii])

        dens = Kval.prod(axis=1) / _bw_cont_product
        k_bar_sum = dens.sum(axis=0)
        F += k_bar_sum  # sum of prod kernel over nobs

    # Second pass: same structure with the ordinary (non-convolution)
    # kernels, evaluated on the leave-one-out samples.
    kertypes = dict(c=kernels.gaussian,
                    o=kernels.wang_ryzin,
                    u=kernels.aitchison_aitken)
    LOO = LeaveOneOut(self.data)
    L = 0  # leave-one-out likelihood
    Kval = np.empty((data.shape[0] - 1, data.shape[1]))
    for i, X_not_i in enumerate(LOO):
        for ii, vtype in enumerate(var_type):
            Kval[:, ii] = kertypes[vtype](bw[ii],
                                          -X_not_i[:, ii],
                                          data[i, ii])
        dens = Kval.prod(axis=1) / _bw_cont_product
        L += dens.sum(axis=0)

    # CV objective function, eq. (2.4) of Ref. [3]
    return (F / nobs**2 - 2 * L / (nobs * (nobs - 1)))