Example #1
    def test_sample_distribution(self):
        # check sample_distribution's ability to sample from a
        # fixed beta distribution
        nclasses = 8
        nitems = 1000
        # pick random parameters
        a = np.random.uniform(1.0, 5.0, size=(nclasses,))
        b = np.random.uniform(4.0, 6.0, size=(nclasses,))

        # sample values from a beta distribution with fixed parameters
        values = np.empty((nitems, nclasses))
        for k in range(nclasses):
            values[:, k] = scipy.stats.beta.rvs(a[k], b[k], size=nitems)
        arguments = values

        def beta_likelihood(params, values):
            a = params[:nclasses].copy()
            b = params[nclasses:].copy()
            llhood = 0.0
            for k in range(nclasses):
                llhood += scipy.stats.beta._logpdf(values[:, k], a[k], b[k]).sum()
            return llhood

        x_lower = np.zeros((nclasses * 2,)) + 0.5
        x_upper = np.zeros((nclasses * 2,)) + 8.0
        x0 = np.random.uniform(1.0, 7.5, size=(nclasses * 2,))

        dx = optimize_step_size(beta_likelihood, x0.copy(), arguments, x_lower, x_upper, 1000, 100, 0.3, 0.1)

        njumps = 3000
        samples = sample_distribution(beta_likelihood, x0.copy(), arguments, dx, njumps, x_lower, x_upper)
        samples = samples[100:]

        z = np.absolute((samples.mean(0) - np.r_[a, b]) / samples.std(0))
        testing.assert_array_less(z, 3.0)
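The test above is a method of a test class and relies on module-level imports that the excerpt omits. The sketch below lists the names it assumes; the `pyanno.sampling` import path for `optimize_step_size` and `sample_distribution` is an assumption about where these MCMC helpers live, not something shown in the excerpt.

import numpy as np
import scipy.stats
from numpy import testing

# assumed import path for the two MCMC helpers exercised by the test
from pyanno.sampling import optimize_step_size, sample_distribution

# call pattern exercised above:
#   dx      = optimize_step_size(llhood, x0, args, lower, upper,
#                                n_samples, adjust_every, target_rejection_rate,
#                                rejection_rate_tolerance)
#   samples = sample_distribution(llhood, x0, args, dx, n_jumps, lower, upper)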
Example #2
    def sample_posterior_over_accuracy(self,
                                       annotations,
                                       nsamples,
                                       burn_in_samples=100,
                                       thin_samples=5,
                                       target_rejection_rate=0.3,
                                       rejection_rate_tolerance=0.2,
                                       step_optimization_nsamples=500,
                                       adjust_step_every=100):
        """Return samples from posterior distribution over theta given data.

        Samples are drawn using a variant of a Metropolis-Hastings Markov Chain
        Monte Carlo (MCMC) algorithm. Sampling proceeds in two phases:

            1) *step size estimation phase*: first, the step size in the
               MCMC algorithm is adjusted to achieve a given rejection rate.

            2) *sampling phase*: second, samples are collected using the
               step size from phase 1.

        Arguments
        ---------
        annotations : ndarray, shape = (n_items, n_annotators)
            annotations[i,j] is the annotation of annotator j for item i

        nsamples : int
            Number of samples to return (i.e., burn-in and thinning samples
            are not included)

        burn_in_samples : int
            Discard the first `burn_in_samples` samples during the initial
            burn-in phase, in which the Markov chain converges to the posterior

        thin_samples : int
            Only return one in every `thin_samples` samples in order to reduce
            the auto-correlation in the sampling chain. This is called
            "thinning" in MCMC parlance.

        target_rejection_rate : float
            target rejection rate for the step size estimation phase

        rejection_rate_tolerance : float
            the step size estimation phase ends when the rejection rate for
            all parameters is within `rejection_rate_tolerance` of
            `target_rejection_rate`

        step_optimization_nsamples : int
            number of samples to draw in the step size estimation phase

        adjust_step_every : int
            number of samples after which the step size is adjusted during
            the step size estimation phase

        Returns
        -------
        samples : ndarray, shape = (n_samples, n_annotators)
            samples[i,:] is one sample from the posterior distribution over the
            parameters `theta`
        """

        self._raise_if_incompatible(annotations)
        nsamples = self._compute_total_nsamples(nsamples, burn_in_samples,
                                                thin_samples)

        # optimize step size
        counts = compute_counts(annotations, self.nclasses)

        # wrap log likelihood function to give it to optimize_step_size and
        # sample_distribution
        _llhood_counts = self._log_likelihood_counts
        _log_prior = self._log_prior

        def _wrap_llhood(params, counts):
            self.theta = params
            return _llhood_counts(counts) + _log_prior()

        # TODO this save-reset is rather ugly, refactor: create copy of
        #      model and sample over it
        # save internal parameters to reset at the end of sampling
        save_params = (self.gamma, self.theta)
        try:
            # compute optimal step size for given target rejection rate
            params_start = self.theta.copy()
            params_upper = np.ones((self.nannotators, ))
            params_lower = np.zeros((self.nannotators, ))
            step = optimize_step_size(_wrap_llhood, params_start, counts,
                                      params_lower, params_upper,
                                      step_optimization_nsamples,
                                      adjust_step_every, target_rejection_rate,
                                      rejection_rate_tolerance)

            # draw samples from posterior distribution over theta
            samples = sample_distribution(_wrap_llhood, params_start, counts,
                                          step, nsamples, params_lower,
                                          params_upper)
            return self._post_process_samples(samples, burn_in_samples,
                                              thin_samples)
        finally:
            # reset parameters
            self.gamma, self.theta = save_params
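A typical call site for this method looks like the sketch below; `model` and `annotations` are assumed inputs (a model object exposing this method and an annotation array), and only the keyword names and the return shape documented above are taken from the excerpt.

# hypothetical usage sketch; `model` and `annotations` are assumed to exist
samples = model.sample_posterior_over_accuracy(
    annotations,          # shape (n_items, n_annotators); annotations[i, j] is annotator j's label for item i
    nsamples=200,         # number of posterior samples actually returned
    burn_in_samples=100,  # discarded while the chain converges
    thin_samples=5,       # keep one sample in every 5 to reduce autocorrelation
)
theta_mean = samples.mean(axis=0)  # posterior mean accuracy per annotator
theta_std = samples.std(axis=0)    # posterior spread per annotator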
Example #3
    def sample_posterior_over_accuracy(self, annotations, nsamples,
                                       burn_in_samples=100,
                                       thin_samples=5,
                                       target_rejection_rate=0.3,
                                       rejection_rate_tolerance=0.2,
                                       step_optimization_nsamples=500,
                                       adjust_step_every=100):
        """Return samples from posterior distribution over theta given data.

        Samples are drawn using a variant of a Metropolis-Hastings Markov Chain
        Monte Carlo (MCMC) algorithm. Sampling proceeds in two phases:

            1) *step size estimation phase*: first, the step size in the
               MCMC algorithm is adjusted to achieve a given rejection rate.

            2) *sampling phase*: second, samples are collected using the
               step size from phase 1.

        Arguments
        ---------
        annotations : ndarray, shape = (n_items, n_annotators)
            annotations[i,j] is the annotation of annotator j for item i

        nsamples : int
            Number of samples to return (i.e., burn-in and thinning samples
            are not included)

        burn_in_samples : int
            Discard the first `burn_in_samples` samples during the initial
            burn-in phase, in which the Markov chain converges to the posterior

        thin_samples : int
            Only return one in every `thin_samples` samples in order to reduce
            the auto-correlation in the sampling chain. This is called
            "thinning" in MCMC parlance.

        target_rejection_rate : float
            target rejection rate for the step size estimation phase

        rejection_rate_tolerance : float
            the step size estimation phase ends when the rejection rate for
            all parameters is within `rejection_rate_tolerance` of
            `target_rejection_rate`

        step_optimization_nsamples : int
            number of samples to draw in the step size estimation phase

        adjust_step_every : int
            number of samples after which the step size is adjusted during
            the step size estimation phase

        Returns
        -------
        samples : ndarray, shape = (n_samples, n_annotators)
            samples[i,:] is one sample from the posterior distribution over the
            parameters `theta`
        """

        self._raise_if_incompatible(annotations)
        nsamples = self._compute_total_nsamples(nsamples,
                                                burn_in_samples,
                                                thin_samples)

        # optimize step size
        counts = compute_counts(annotations, self.nclasses)

        # wrap log likelihood function to give it to optimize_step_size and
        # sample_distribution
        _llhood_counts = self._log_likelihood_counts
        _log_prior = self._log_prior
        def _wrap_llhood(params, counts):
            self.theta = params
            return _llhood_counts(counts) + _log_prior()

        # TODO this save-reset is rather ugly, refactor: create copy of
        #      model and sample over it
        # save internal parameters to reset at the end of sampling
        save_params = self.theta
        try:
            # compute optimal step size for given target rejection rate
            params_start = self.theta.copy()
            params_upper = np.ones((self.nannotators,))
            params_lower = np.zeros((self.nannotators,))
            step = optimize_step_size(_wrap_llhood, params_start, counts,
                                params_lower, params_upper,
                                step_optimization_nsamples,
                                adjust_step_every,
                                target_rejection_rate,
                                rejection_rate_tolerance)

            # draw samples from posterior distribution over theta
            samples = sample_distribution(_wrap_llhood, params_start, counts,
                                          step, nsamples,
                                          params_lower, params_upper)

            return self._post_process_samples(samples, burn_in_samples,
                                              thin_samples)
        finally:
            # reset parameters
            self.theta = save_params
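The helpers `_compute_total_nsamples` and `_post_process_samples` are not shown in either excerpt, but the docstring pins down what they must do: draw enough raw samples to cover burn-in and thinning, then return the chain with the burn-in prefix dropped and only every `thin_samples`-th sample kept. A minimal sketch of that bookkeeping (hypothetical bodies, not necessarily the library's actual implementation):

def _compute_total_nsamples(nsamples, burn_in_samples, thin_samples):
    # total chain length needed so that `nsamples` samples survive
    # burn-in removal and thinning
    return burn_in_samples + nsamples * thin_samples

def _post_process_samples(samples, burn_in_samples, thin_samples):
    # drop the burn-in prefix, then keep one sample in every `thin_samples`
    return samples[burn_in_samples::thin_samples]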