def test_sample_distribution(self):
    # check sample_distribution's ability to sample from a
    # fixed beta distribution
    nclasses = 8
    nitems = 1000

    # pick random parameters
    a = np.random.uniform(1.0, 5.0, size=(nclasses,))
    b = np.random.uniform(4.0, 6.0, size=(nclasses,))

    # sample values from a beta distribution with fixed parameters
    values = np.empty((nitems, nclasses))
    for k in range(nclasses):
        values[:, k] = scipy.stats.beta.rvs(a[k], b[k], size=nitems)
    arguments = values

    def beta_likelihood(params, values):
        # log likelihood of the data under one beta distribution per class;
        # `params` stacks the a parameters followed by the b parameters
        a = params[:nclasses].copy()
        b = params[nclasses:].copy()
        llhood = 0.0
        for k in range(nclasses):
            llhood += scipy.stats.beta._logpdf(values[:, k], a[k], b[k]).sum()
        return llhood

    x_lower = np.zeros((nclasses * 2,)) + 0.5
    x_upper = np.zeros((nclasses * 2,)) + 8.0
    x0 = np.random.uniform(1.0, 7.5, size=(nclasses * 2,))

    dx = optimize_step_size(beta_likelihood, x0.copy(), arguments,
                            x_lower, x_upper,
                            1000, 100, 0.3, 0.1)

    njumps = 3000
    samples = sample_distribution(beta_likelihood, x0.copy(), arguments,
                                  dx, njumps, x_lower, x_upper)

    # discard burn-in samples, then check that the true parameters lie
    # within 3 standard deviations of the sample mean
    samples = samples[100:]
    z = np.absolute((samples.mean(0) - np.r_[a, b]) / samples.std(0))
    testing.assert_array_less(z, 3.0)
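# For reference, a minimal sketch of the kind of component-wise random-walk
# Metropolis sampler that `sample_distribution` is assumed to implement in the
# test above. All names below (`metropolis_sketch`, `logp`, `rng`) are
# illustrative only; the real implementation may differ in its proposal and
# rejection bookkeeping.
import numpy as np

def metropolis_sketch(logp, x0, args, dx, njumps, x_lower, x_upper, rng=None):
    # random-walk Metropolis with a symmetric uniform proposal per parameter;
    # proposals outside the [x_lower, x_upper] box are rejected outright
    rng = np.random.default_rng() if rng is None else rng
    x = x0.copy()
    logp_x = logp(x, args)
    samples = np.empty((njumps, x.size))
    for i in range(njumps):
        for j in range(x.size):
            proposal = x.copy()
            proposal[j] = x[j] + rng.uniform(-dx[j], dx[j])
            if not (x_lower[j] <= proposal[j] <= x_upper[j]):
                continue
            logp_new = logp(proposal, args)
            # accept with probability min(1, exp(logp_new - logp_x))
            if np.log(rng.uniform()) < logp_new - logp_x:
                x, logp_x = proposal, logp_new
        samples[i] = x
    return samples

# With `logp=beta_likelihood` and the `dx`, bounds, and `njumps` used in the
# test, this would return a (njumps, 2 * nclasses) array analogous to `samples`.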
def sample_posterior_over_accuracy(self, annotations, nsamples,
                                   burn_in_samples=100,
                                   thin_samples=5,
                                   target_rejection_rate=0.3,
                                   rejection_rate_tolerance=0.2,
                                   step_optimization_nsamples=500,
                                   adjust_step_every=100):
    """Return samples from posterior distribution over theta given data.

    Samples are drawn using a variant of the Metropolis-Hastings Markov
    Chain Monte Carlo (MCMC) algorithm. Sampling proceeds in two phases:

    1) *step size estimation phase*: first, the step size in the MCMC
       algorithm is adjusted to achieve a given rejection rate.

    2) *sampling phase*: second, samples are collected using the step
       size from phase 1.

    Arguments
    ---------
    annotations : ndarray, shape = (n_items, n_annotators)
        annotations[i,j] is the annotation of annotator j for item i

    nsamples : int
        Number of samples to return (i.e., burn-in and thinning samples
        are not included)

    burn_in_samples : int
        Discard the first `burn_in_samples` during the initial burn-in
        phase, where the Monte Carlo chain converges to the posterior

    thin_samples : int
        Only return one of every `thin_samples` samples in order to
        reduce the auto-correlation in the sampling chain. This is
        called "thinning" in MCMC parlance.

    target_rejection_rate : float
        target rejection rate for the step size estimation phase

    rejection_rate_tolerance : float
        the step size estimation phase is ended when the rejection rate
        for all parameters is within `rejection_rate_tolerance` of
        `target_rejection_rate`

    step_optimization_nsamples : int
        number of samples to draw in the step size estimation phase

    adjust_step_every : int
        number of samples after which the step size is adjusted during
        the step size estimation phase

    Returns
    -------
    samples : ndarray, shape = (n_samples, n_annotators)
        samples[i,:] is one sample from the posterior distribution over
        the parameters `theta`
    """
    self._raise_if_incompatible(annotations)
    nsamples = self._compute_total_nsamples(nsamples, burn_in_samples,
                                            thin_samples)

    # optimize step size
    counts = compute_counts(annotations, self.nclasses)

    # wrap log likelihood function to give it to optimize_step_size and
    # sample_distribution
    _llhood_counts = self._log_likelihood_counts
    _log_prior = self._log_prior
    def _wrap_llhood(params, counts):
        self.theta = params
        return _llhood_counts(counts) + _log_prior()

    # TODO this save-reset is rather ugly, refactor: create copy of
    # model and sample over it

    # save internal parameters to reset at the end of sampling
    save_params = (self.gamma, self.theta)
    try:
        # compute optimal step size for given target rejection rate
        params_start = self.theta.copy()
        params_upper = np.ones((self.nannotators,))
        params_lower = np.zeros((self.nannotators,))
        step = optimize_step_size(_wrap_llhood, params_start, counts,
                                  params_lower, params_upper,
                                  step_optimization_nsamples,
                                  adjust_step_every,
                                  target_rejection_rate,
                                  rejection_rate_tolerance)

        # draw samples from posterior distribution over theta
        samples = sample_distribution(_wrap_llhood, params_start, counts,
                                      step, nsamples,
                                      params_lower, params_upper)
        return self._post_process_samples(samples, burn_in_samples,
                                          thin_samples)
    finally:
        # reset parameters
        self.gamma, self.theta = save_params
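# A minimal usage sketch for the method above. The `model` object, its
# construction, and the `annotations` array are assumptions here (any model
# instance exposing sample_posterior_over_accuracy with this signature would
# do); only the call signature and the shape of the returned samples come from
# the docstring above.
import numpy as np

samples = model.sample_posterior_over_accuracy(
    annotations,
    nsamples=200,           # samples returned after burn-in and thinning
    burn_in_samples=100,
    thin_samples=5,
)

# summarize the posterior over each annotator's accuracy with the mean and a
# 95% credible interval
theta_mean = samples.mean(axis=0)
theta_low, theta_high = np.percentile(samples, [2.5, 97.5], axis=0)
for j in range(samples.shape[1]):
    print("annotator %d: theta = %.3f (95%% CI %.3f-%.3f)"
          % (j, theta_mean[j], theta_low[j], theta_high[j]))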
def sample_posterior_over_accuracy(self, annotations, nsamples,
                                   burn_in_samples=100,
                                   thin_samples=5,
                                   target_rejection_rate=0.3,
                                   rejection_rate_tolerance=0.2,
                                   step_optimization_nsamples=500,
                                   adjust_step_every=100):
    """Return samples from posterior distribution over theta given data.

    Samples are drawn using a variant of the Metropolis-Hastings Markov
    Chain Monte Carlo (MCMC) algorithm. Sampling proceeds in two phases:

    1) *step size estimation phase*: first, the step size in the MCMC
       algorithm is adjusted to achieve a given rejection rate.

    2) *sampling phase*: second, samples are collected using the step
       size from phase 1.

    Arguments
    ---------
    annotations : ndarray, shape = (n_items, n_annotators)
        annotations[i,j] is the annotation of annotator j for item i

    nsamples : int
        number of samples to draw from the posterior (burn-in and
        thinning samples are not counted)

    burn_in_samples : int
        Discard the first `burn_in_samples` during the initial burn-in
        phase, where the Monte Carlo chain converges to the posterior

    thin_samples : int
        Only return one of every `thin_samples` samples in order to
        reduce the auto-correlation in the sampling chain. This is
        called "thinning" in MCMC parlance.

    target_rejection_rate : float
        target rejection rate for the step size estimation phase

    rejection_rate_tolerance : float
        the step size estimation phase is ended when the rejection rate
        for all parameters is within `rejection_rate_tolerance` of
        `target_rejection_rate`

    step_optimization_nsamples : int
        number of samples to draw in the step size estimation phase

    adjust_step_every : int
        number of samples after which the step size is adjusted during
        the step size estimation phase

    Returns
    -------
    samples : ndarray, shape = (n_samples, n_annotators)
        samples[i,:] is one sample from the posterior distribution over
        the parameters `theta`
    """
    self._raise_if_incompatible(annotations)
    nsamples = self._compute_total_nsamples(nsamples, burn_in_samples,
                                            thin_samples)

    # optimize step size
    counts = compute_counts(annotations, self.nclasses)

    # wrap log likelihood function to give it to optimize_step_size and
    # sample_distribution
    _llhood_counts = self._log_likelihood_counts
    _log_prior = self._log_prior
    def _wrap_llhood(params, counts):
        self.theta = params
        return _llhood_counts(counts) + _log_prior()

    # TODO this save-reset is rather ugly, refactor: create copy of
    # model and sample over it

    # save internal parameters to reset at the end of sampling
    save_params = self.theta
    try:
        # compute optimal step size for given target rejection rate
        params_start = self.theta.copy()
        params_upper = np.ones((self.nannotators,))
        params_lower = np.zeros((self.nannotators,))
        step = optimize_step_size(_wrap_llhood, params_start, counts,
                                  params_lower, params_upper,
                                  step_optimization_nsamples,
                                  adjust_step_every,
                                  target_rejection_rate,
                                  rejection_rate_tolerance)

        # draw samples from posterior distribution over theta
        samples = sample_distribution(_wrap_llhood, params_start, counts,
                                      step, nsamples,
                                      params_lower, params_upper)
        return self._post_process_samples(samples, burn_in_samples,
                                          thin_samples)
    finally:
        # reset parameters
        self.theta = save_params
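# The helpers `_compute_total_nsamples` and `_post_process_samples` are not
# shown in this excerpt. A plausible sketch of the bookkeeping they are assumed
# to perform, written as free functions for illustration: draw enough raw
# samples so that `nsamples` remain after the burn-in prefix is dropped and the
# chain is thinned.

def _compute_total_nsamples(nsamples, burn_in_samples, thin_samples):
    # total raw chain length: burn-in plus one kept sample every `thin_samples`
    return burn_in_samples + nsamples * thin_samples

def _post_process_samples(samples, burn_in_samples, thin_samples):
    # drop the burn-in prefix, then keep one sample every `thin_samples`
    return samples[burn_in_samples::thin_samples]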