import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import vonmises, gamma


def adjust(self, node):
    """Sample the node's angle and length from their posteriors."""
    D = np.vstack([d.pos for d in node.data])
    old = np.array([node.get_likelihood_angle(d) for d in node.data])
    old = np.log(old).mean()

    D -= node.parent.pos
    D = D.T
    old_angle = node.angle

    # Determine the posterior of the angle given the von Mises prior and the data
    x_ang = np.arctan2(D[1, :], D[0, :])
    R1 = self.angle_kappa * np.cos(node.parent.angle) + np.sum(np.cos(x_ang))
    R2 = self.angle_kappa * np.sin(node.parent.angle) + np.sum(np.sin(x_ang))
    mu = np.arctan2(R2, R1)
    Rn = R1 / np.cos(mu)  # resultant length = posterior concentration
    node.angle = vonmises.rvs(Rn, loc=mu)

    X = D.copy()

    # Rotate the data into the frame of the newly drawn angle
    # (rotmat is the 2-D rotation-matrix helper defined elsewhere in this module)
    D = np.dot(rotmat(-node.angle), D)
    assert D.shape[1] == len(node.data)

    # Determine the posterior of the length given the gamma prior and the data
    aN = self.length_a0 + D.shape[1] / 2.0
    bN = self.length_b0 + D.shape[1] / 2.0 * np.var(D[0, :])
    node.length = gamma.rvs(aN, scale=1.0 / bN)  # Gamma(aN, rate=bN) draw

    old_pos = node.pos.copy()
    node.update_position()

    def f():
        # Debugging helper: plot the data and the old/new node positions
        # relative to the parent.
        plt.plot(X[0, :], X[1, :], ".b")
        plt.plot(node.pos[0] - node.parent.pos[0],
                 node.pos[1] - node.parent.pos[1], "*r")
        plt.plot(old_pos[0] - node.parent.pos[0],
                 old_pos[1] - node.parent.pos[1], "*b")
        plt.show()

    new = np.array([node.get_likelihood_angle(d) for d in node.data])
    new = np.log(new).mean()
    print('old likelihood: %f new likelihood: %f' % (old, new))
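
# A minimal, standalone sketch of the conjugate von Mises update used in
# `adjust` above, run on synthetic angles. All names here (mu0, kappa0,
# x_ang) are illustrative and not part of the original code.
#
#   import numpy as np
#   from scipy.stats import vonmises
#
#   mu0, kappa0 = 0.3, 2.0                        # prior mean direction and concentration
#   x_ang = vonmises.rvs(4.0, loc=0.5, size=50)   # synthetic observed angles
#
#   R1 = kappa0 * np.cos(mu0) + np.cos(x_ang).sum()
#   R2 = kappa0 * np.sin(mu0) + np.sin(x_ang).sum()
#   mu_n = np.arctan2(R2, R1)                     # posterior mean direction
#   kappa_n = np.hypot(R1, R2)                    # posterior concentration (== R1 / cos(mu_n))
#
#   angle_sample = vonmises.rvs(kappa_n, loc=mu_n)  # one posterior draw, as in `adjust`
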
def infer_sample_rate(series):
    seriesNone = series + []
    series = series + []  # we need a fresh copy!
    for i, s in enumerate(series):
        if seriesNone[i] is None:
            series[i] = 0
        if series[i] == 0:
            series[i] = 0.001  # avoid a zero shape parameter for the gamma draw
    rates = list(gamma.rvs(series, loc=1))  # one gamma draw per (adjusted) count
    for i, r in enumerate(rates):
        # mask entries that were missing or zero in the original series
        if seriesNone[i] is None or seriesNone[i] == 0:
            rates[i] = None
    return rates
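
# Hypothetical usage of `infer_sample_rate`: entries that were None or zero
# in the input come back as None; the printed values are only indicative,
# since each rate is a random gamma draw.
#
#   counts = [12, 0, None, 7]
#   rates = infer_sample_rate(counts)
#   print(rates)   # e.g. [13.8, None, None, 8.4]
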
def _initialise_posterior(self, data):
    D = self.basis.get_dim(data[0])

    # Initialise weights and covariances with a MAP fit
    res = sgd(self._map,
              self.__random.randn(D),
              data,
              maxiter=self.maxiter,
              updater=self.updater,
              batch_size=self.batch_size,
              random_state=self.randstate)

    # Initialise each posterior component randomly around the MAP weights
    self.covariance = gamma.rvs(2, scale=0.5, size=(D, self.K))
    self.weights = res.x[:, np.newaxis] + \
        np.sqrt(self.covariance) * self.__random.rand(D, self.K)
    self.weights[:, 0] = res.x  # Make sure we include the MAP weights too
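
# Toy, self-contained illustration of the initialisation pattern above:
# K diagonal-Gaussian components scattered around a single MAP weight
# vector. `w_map`, `D` and `K` are made-up stand-ins for the real values.
#
#   import numpy as np
#   from scipy.stats import gamma
#
#   D, K = 5, 3
#   rng = np.random.RandomState(0)
#   w_map = rng.randn(D)
#
#   covariance = gamma.rvs(2, scale=0.5, size=(D, K), random_state=rng)
#   weights = w_map[:, np.newaxis] + np.sqrt(covariance) * rng.rand(D, K)
#   weights[:, 0] = w_map   # keep the exact MAP solution as the first component
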
def learn(X, y, likelihood, lparams, basis, bparams, regulariser=1.,
          postcomp=10, use_sgd=True, maxit=1000, tol=1e-7, batchsize=100,
          rate=0.9, eta=1e-5, verbose=True):
    """
    Learn the parameters of a Bayesian generalised linear model (GLM).

    The learning algorithm uses nonparametric variational inference [1]_,
    and optionally stochastic gradients.

    Parameters
    ----------
    X: ndarray
        (N, d) array input dataset (N samples, d dimensions).
    y: ndarray
        (N,) array targets (N samples).
    likelihood: Object
        A likelihood object, see the likelihoods module.
    lparams: sequence
        a sequence of parameters for the likelihood object, e.g. the
        likelihoods.Gaussian object takes a variance parameter, so this
        should be :code:`[var]`.
    basis: Basis
        A basis object, see the basis_functions module.
    bparams: sequence
        A sequence of parameters of the basis object.
    regulariser: float, optional
        weight regulariser (variance) initial value.
    postcomp: int, optional
        Number of diagonal Gaussian components to use to approximate the
        posterior distribution.
    tol: float, optional
        Optimiser relative tolerance convergence criterion.
    use_sgd: bool, optional
        If :code:`True` then use SGD (Adadelta) optimisation instead of
        L-BFGS.
    maxit: int, optional
        Maximum number of iterations of the optimiser to run. If
        :code:`use_sgd` is :code:`True` then this is the number of complete
        passes through the data before optimization terminates (unless it
        converges first).
    batchsize: int, optional
        number of observations to use per SGD batch. Ignored if
        :code:`use_sgd=False`.
    rate: float, optional
        SGD decay rate, must be in [0, 1]. Ignored if :code:`use_sgd=False`.
    eta: float, optional
        Jitter term for Adadelta SGD. Ignored if :code:`use_sgd=False`.
    verbose: bool, optional
        log the learning status.

    Returns
    -------
    m: ndarray
        (D, postcomp) array of posterior weight means (D is the dimension of
        the features).
    C: ndarray
        (D, postcomp) array of posterior weight variances.
    lparams: sequence
        learned sequence of likelihood object hyperparameters.
    bparams: sequence
        learned sequence of basis object hyperparameters.

    Notes
    -----
    This approximates the posterior distribution over the weights with a
    mixture of Gaussians:

    .. math ::

        \mathbf{w} \sim \\frac{1}{K} \sum^K_{k=1}
            \mathcal{N}(\mathbf{m_k}, \\boldsymbol{\Psi}_k)

    where,

    .. math ::

        \\boldsymbol{\Psi}_k = \\text{diag}([\Psi_{k,1}, \ldots, \Psi_{k,D}]).

    This is so arbitrary likelihoods can be used with this algorithm, while
    still maintaining flexible and tractable non-Gaussian posteriors.
    Additionally this has the benefit of a reduced number of parameters to
    optimise (compared with full covariance Gaussians).

    The main differences between this implementation and the GLM in [1]_ are:

    - We use diagonal mixtures, as opposed to isotropic.
    - We do not cycle between optimising eq. 10 and 11 (objectives L1 and L2)
      in the paper. We use the full objective L2 for everything, including
      the posterior means, and we optimise all parameters together.

    Even though these changes make learning a little slower, and require
    third derivatives of the likelihoods, we obtain better results and we can
    use SGD straightforwardly.
    """
    N, d = X.shape
    D = basis(np.atleast_2d(X[0, :]), *bparams).shape[1]
    K = postcomp

    # Pre-allocate here
    dm = np.zeros((D, K))
    dC = np.zeros((D, K))
    H = np.empty((D, K))
    # Objective function Eq. 10 from [1], and gradients of ALL params
    def L2(_m, _C, _reg, _lparams, *args):

        # Extract data, parameters, etc.
        _bparams, y, X = args[:-1], args[-1][:, 0], args[-1][:, 1:]

        # Dimensions
        M, d = X.shape
        D, K = _m.shape
        B = N / M

        # Basis function stuff
        Phi = basis(X, *_bparams)  # M x D
        Phi2 = Phi**2
        Phi3 = Phi**3
        f = Phi.dot(_m)  # M x K
        df, d2f, d3f = np.zeros((M, K)), np.zeros((M, K)), np.zeros((M, K))

        # Posterior responsibility terms
        logqkk = _qmatrix(_m, _C)
        logqk = logsumexp(logqkk, axis=0)  # log term of Eq. 7 from [1]
        pz = np.exp(logqkk - logqk)

        # Big loop through posterior mixtures for calculating stuff
        ll = 0
        dlp = [np.zeros_like(p) for p in _lparams]

        for k in range(K):

            # Common likelihood calculations
            ll += B * likelihood.loglike(y, f[:, k], *_lparams).sum()
            df[:, k] = B * likelihood.df(y, f[:, k], *_lparams)
            d2f[:, k] = B * likelihood.d2f(y, f[:, k], *_lparams)
            d3f[:, k] = B * likelihood.d3f(y, f[:, k], *_lparams)
            H[:, k] = d2f[:, k].dot(Phi2) - 1. / _reg

            # Posterior mean and covariance gradients
            mkmj = _m[:, k][:, np.newaxis] - _m
            iCkCj = 1 / (_C[:, k][:, np.newaxis] + _C)

            dC[:, k] = (-((mkmj * iCkCj)**2 - 2 * iCkCj).dot(pz[:, k])
                        + H[:, k]) / (2 * K)
            dm[:, k] = (df[:, k].dot(Phi)
                        + 0.5 * _C[:, k] * d3f[:, k].dot(Phi3)
                        + (iCkCj * mkmj).dot(pz[:, k])
                        - _m[:, k] / _reg) / K

            # Likelihood parameter gradients
            dp = likelihood.dp(y, f[:, k], *_lparams)
            dp2df = likelihood.dpd2f(y, f[:, k], *_lparams)
            for l in range(len(_lparams)):
                dpH = dp2df[l].dot(Phi2)
                dlp[l] -= B * (dp[l].sum() + 0.5 * (_C[:, k] * dpH).sum()) / K

        # Regulariser gradient
        dreg = (((_m**2).sum() + _C.sum()) / _reg**2 - D * K / _reg) / (2 * K)

        # Basis function parameter gradients
        def dtheta(dPhi):
            dt = 0
            dPhiPhi = dPhi * Phi
            for k in range(K):
                dPhimk = dPhi.dot(_m[:, k])
                dPhiH = d2f[:, k].dot(dPhiPhi) + \
                    0.5 * (d3f[:, k] * dPhimk).dot(Phi2)
                dt -= (df[:, k].dot(dPhimk) + (_C[:, k] * dPhiH).sum()) / K
            return dt

        dbp = apply_grad(dtheta, basis.grad(X, *_bparams))

        # Objective, Eq. 10 in [1]
        L2 = 1. / K * (ll
                       - 0.5 * D * K * np.log(2 * np.pi * _reg)
                       - 0.5 * (_m**2).sum() / _reg
                       + 0.5 * (_C * H).sum()
                       - logqk.sum()
                       + np.log(K))

        if verbose:
            log.info("L2 = {}, reg = {}, lparams = {}, bparams = {}"
                     .format(L2, _reg, _lparams, _bparams))

        return -L2, append_or_extend([-dm, -dC, -dreg, dlp], dbp)

    # Initialise m and C
    m = np.random.randn(D, K) + np.arange(K) - K / 2
    C = gamma.rvs(2, scale=0.5, size=(D, K))

    bounds = [Bound(shape=m.shape),
              Positive(shape=C.shape),
              Positive(),
              likelihood.bounds]
    append_or_extend(bounds, basis.bounds)
    vparams = [m, C, regulariser, lparams] + bparams

    if use_sgd is False:
        nmin = structured_minimizer(logtrick_minimizer(minimize))
        res = nmin(L2, vparams, ftol=tol, maxiter=maxit, method='L-BFGS-B',
                   jac=True, bounds=bounds,
                   args=(np.hstack((y[:, np.newaxis], X)),))
    else:
        nsgd = structured_sgd(logtrick_sgd(sgd))
        res = nsgd(L2, vparams, np.hstack((y[:, np.newaxis], X)), rate=rate,
                   eta=eta, bounds=bounds, gtol=tol, passes=maxit,
                   batchsize=batchsize, eval_obj=True)

    (m, C, regulariser, lparams), bparams = res.x[:4], res.x[4:]

    if verbose:
        log.info("Finished! Objective = {}, reg = {}, lparams = {}, "
                 "bparams = {}, message: {}."
                 .format(-res.fun, regulariser, lparams, bparams,
                         res.message))

    return m, C, lparams, bparams
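
# A hedged usage sketch for `learn`. The Gaussian likelihood is named in the
# docstring above; the basis object and its constructor arguments below are
# assumptions standing in for whatever the accompanying basis_functions
# module provides, and the imports of those modules are elided because their
# paths are not specified here.
#
#   import numpy as np
#
#   X = np.random.randn(200, 2)
#   y = X[:, 0] - 2 * X[:, 1] + 0.1 * np.random.randn(200)
#
#   lhood = likelihoods.Gaussian()        # lparams = [variance], per the docstring
#   feat = basis_functions.RandomRBF(50, 2)  # assumed basis class and signature
#
#   m, C, lparams, bparams = learn(X, y, lhood, [1.0], feat, [0.5],
#                                  postcomp=4, use_sgd=False, maxit=500)
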