Exemplo n.º 1
0
 def _approximate_dataprob_correction(self, sample_size):
     '''
     ad hoc approximation,
     see `python derivations/clustering.py dataprob`
     see `python derivations/clustering.py approximations`
     '''
     n = log(sample_size)
     N = log(self.dataset_size)
     return 0.061 * n * (n - N) * (n + N) ** 0.75
Exemplo n.º 2
0
 def score_data(self, shared):
     """
     Computes the joint log p(q, Y): beta log-prior on p plus the
     bernoulli log-likelihood of the observed heads/tails.

     Bug fix: the original guard `self.p >= 0. and self.p <= 1.` let
     p == 0. or p == 1. reach log(0.), raising ValueError; the
     boundaries are now handled explicitly (likelihood is 0. when no
     contradicting observation exists, -inf otherwise).
     """
     prior = sp.stats.beta.logpdf(self.p, shared.alpha, shared.beta)
     if 0. < self.p < 1.:
         likelihood = self.heads * \
             log(self.p) + self.tails * log(1. - self.p)
     elif self.p == 0.:
         # all mass on tails; any observed head has probability zero
         likelihood = 0. if self.heads == 0 else -np.inf
     elif self.p == 1.:
         # all mass on heads; any observed tail has probability zero
         likelihood = 0. if self.tails == 0 else -np.inf
     else:
         likelihood = -np.inf
     return prior + likelihood
Exemplo n.º 3
0
 def score_data(self, shared):
     """
     Computes the joint log p(q, Y): beta log-prior on p plus the
     bernoulli log-likelihood of the observed heads/tails.

     Bug fix: the original guard `self.p >= 0. and self.p <= 1.` let
     p == 0. or p == 1. reach log(0.), raising ValueError; the
     boundaries are now handled explicitly (likelihood is 0. when no
     contradicting observation exists, -inf otherwise).
     """
     prior = sp.stats.beta.logpdf(self.p, shared.alpha, shared.beta)
     if 0. < self.p < 1.:
         likelihood = self.heads * \
             log(self.p) + self.tails * log(1. - self.p)
     elif self.p == 0.:
         # all mass on tails; any observed head has probability zero
         likelihood = 0. if self.heads == 0 else -np.inf
     elif self.p == 1.:
         # all mass on heads; any observed tail has probability zero
         likelihood = 0. if self.tails == 0 else -np.inf
     else:
         likelihood = -np.inf
     return prior + likelihood
 def _approximate_dataprob_correction(self, sample_size):
     '''
     ad hoc approximation,
     see `python derivations/clustering.py dataprob`
     see `python derivations/clustering.py approximations`
     '''
     n = log(sample_size)
     N = log(self.dataset_size)
     return 0.061 * n * (n - N) * (n + N) ** 0.75
Exemplo n.º 5
0
 def score_group(self, group):
     """
     Marginal log likelihood of a group's data under the conjugate
     prior; \cite{murphy2007conjugate}, Eq. 171.
     """
     post = self.plus_group(group)
     score = gammaln(post.nu / 2.) - gammaln(self.nu / 2.)
     score += 0.5 * log(self.kappa / post.kappa)
     score += (0.5 * self.nu) * log(self.nu * self.sigmasq)
     score -= (0.5 * post.nu) * log(post.nu * post.sigmasq)
     # 1.1447298858493991 ~= log(pi); kept verbatim for bit-exact scores
     score -= group.count / 2. * 1.1447298858493991
     return score
Exemplo n.º 6
0
 def score_data(self, shared):
     """
     Marginal log likelihood of this group's data under the conjugate
     prior; \cite{murphy2007conjugate}, Eq. 171.
     """
     post = shared.plus_group(self)
     score = gammaln(post.nu / 2.) - gammaln(shared.nu / 2.)
     score += 0.5 * log(shared.kappa / post.kappa)
     score += (0.5 * shared.nu) * log(shared.nu * shared.sigmasq)
     score -= (0.5 * post.nu) * log(post.nu * post.sigmasq)
     # 1.1447298858493991 ~= log(pi); kept verbatim for bit-exact scores
     score -= self.count / 2. * 1.1447298858493991
     return score
Exemplo n.º 7
0
def score_student_t(x, nu, mu, sigmasq):
    """
    Log density of a Student-t distribution with nu degrees of freedom,
    location mu, and squared scale sigmasq;
    \cite{murphy2007conjugate}, Eq. 304.
    """
    half_nup1 = .5 * (nu + 1.)
    normalizer = gammaln(half_nup1) - gammaln(.5 * nu)
    normalizer -= .5 * log(nu * pi * sigmasq)
    deviation = x - mu
    standardized = deviation * deviation / sigmasq
    return normalizer - half_nup1 * log(1. + standardized / nu)
Exemplo n.º 8
0
 def score_data(self, shared):
     """
     Marginal log likelihood of this group's data under the conjugate
     prior; \cite{murphy2007conjugate}, Eq. 171.
     """
     post = shared.plus_group(self)
     score = gammaln(post.nu / 2.) - gammaln(shared.nu / 2.)
     score += 0.5 * log(shared.kappa / post.kappa)
     score += (0.5 * shared.nu) * log(shared.nu * shared.sigmasq)
     score -= (0.5 * post.nu) * log(post.nu * post.sigmasq)
     # 1.1447298858493991 ~= log(pi); kept verbatim for bit-exact scores
     score -= self.count / 2. * 1.1447298858493991
     return score
Exemplo n.º 9
0
def score_student_t(x, nu, mu, sigmasq):
    """
    Log density of a Student-t distribution with nu degrees of freedom,
    location mu, and squared scale sigmasq;
    \cite{murphy2007conjugate}, Eq. 304.
    """
    half_nup1 = .5 * (nu + 1.)
    normalizer = gammaln(half_nup1) - gammaln(.5 * nu)
    normalizer -= .5 * log(nu * pi * sigmasq)
    deviation = x - mu
    standardized = deviation * deviation / sigmasq
    return normalizer - half_nup1 * log(1. + standardized / nu)
Exemplo n.º 10
0
 def score_value(self, shared, value):
     """
     Posterior predictive log probability of a single value.

     \cite{wallach2009rethinking} Eqn 4.
     McCallum, et. al, 'Rethinking LDA: Why Priors Matter'
     """
     posterior_count = self.counts[value] + shared.alphas[value]
     total = self.counts.sum() + shared.alphas.sum()
     return log(posterior_count / total)
Exemplo n.º 11
0
 def score_value(self, shared, value):
     """
     Posterior predictive log probability of a single value.

     \cite{wallach2009rethinking} Eqn 4.
     McCallum, et. al, 'Rethinking LDA: Why Priors Matter'
     """
     posterior_count = self.counts[value] + shared.alphas[value]
     total = self.counts.sum() + shared.alphas.sum()
     return log(posterior_count / total)
Exemplo n.º 12
0
 def score_value(self, shared, value):
     """
     Posterior predictive log probability of a single coin flip.

     \cite{wallach2009rethinking} Eqn 4.
     McCallum, et. al, 'Rethinking LDA: Why Priors Matter'
     """
     posterior_heads = shared.alpha + self.heads
     posterior_tails = shared.beta + self.tails
     total = posterior_heads + posterior_tails
     if value:
         return log(posterior_heads / total)
     else:
         return log(posterior_tails / total)
Exemplo n.º 13
0
 def score_value(self, shared, value):
     """
     Posterior predictive log probability of a single coin flip.

     \cite{wallach2009rethinking} Eqn 4.
     McCallum, et. al, 'Rethinking LDA: Why Priors Matter'
     """
     posterior_heads = shared.alpha + self.heads
     posterior_tails = shared.beta + self.tails
     total = posterior_heads + posterior_tails
     if value:
         return log(posterior_heads / total)
     else:
         return log(posterior_tails / total)
Exemplo n.º 14
0
 def score_value(self, group, value):
     """
     Posterior predictive log probability of a single value.

     Adapted from dd.py, which was adapted from:
     McCallum, et. al, 'Rethinking LDA: Why Priors Matter' eqn 4
     """
     denom = self.alpha + group.total
     if value == OTHER:
         # pseudo-count mass reserved for not-yet-seen values
         numer = self.beta0 * self.alpha
     else:
         numer = self.alpha * self.betas[value] + group.counts.get(value, 0)
     return log(numer / denom)
Exemplo n.º 15
0
    def score_add_value(
            self,
            group_size,
            nonempty_group_count,
            sample_size,
            empty_group_count=1):
        '''
        Return log of posterior predictive probability given
        sufficient statistics of a partial assignments vector [X_0,...,X_{n-1}]

            log P[ X_n = k | X_0=x_0, ..., X_{n-1}=x_{n-1} ]

        where

            group_size = #{i | x_i = k, i in {0,...,n-1}}

            nonempty_group_count = #{x_i | i in {0,...,n-1}}

            sample_size = n

        and empty_group_count is the number of empty groups that are uniformly
        competing for the assignment.  Typically empty_group_count = 1, but
        multiple empty "ephemeral" groups are used in e.g. Radford Neal's
        Algorithm-8 \cite{neal2000markov}.
        '''
        assert sample_size < self.dataset_size
        assert 0 < empty_group_count

        # Empty group: mass is shared uniformly among the competing empties,
        # with a correction term while data points remain unassigned.
        if group_size == 0:
            score = -log(empty_group_count)
            if sample_size + 1 < self.dataset_size:
                score += self._approximate_postpred_correction(sample_size + 1)
            return score

        # Nonempty group; see `python derivations/clustering.py fastlog`.
        bigger = group_size + 1.0
        if group_size > 10000:
            # log(bigger / group_size) * group_size -> 1 as group_size -> inf
            return 1.0 + log(bigger)
        return group_size * log(bigger / group_size) + log(bigger)
Exemplo n.º 16
0
    def score_add_value(
            self,
            group_size,
            nonempty_group_count,
            sample_size,
            empty_group_count=1):
        '''
        Return log of posterior predictive probability given
        sufficient statistics of a partial assignments vector [X_0,...,X_{n-1}]

            log P[ X_n = k | X_0=x_0, ..., X_{n-1}=x_{n-1} ]

        where

            group_size = #{i | x_i = k, i in {0,...,n-1}}

            nonempty_group_count = #{x_i | i in {0,...,n-1}}

            sample_size = n

        and empty_group_count is the number of empty groups that are uniformly
        competing for the assignment.  Typically empty_group_count = 1, but
        multiple empty "ephemeral" groups are used in e.g. Radford Neal's
        Algorithm-8 \cite{neal2000markov}.
        '''
        assert sample_size < self.dataset_size
        assert 0 < empty_group_count

        # Empty group: mass is shared uniformly among the competing empties,
        # with a correction term while data points remain unassigned.
        if group_size == 0:
            score = -log(empty_group_count)
            if sample_size + 1 < self.dataset_size:
                score += self._approximate_postpred_correction(sample_size + 1)
            return score

        # Nonempty group; see `python derivations/clustering.py fastlog`.
        bigger = group_size + 1.0
        if group_size > 10000:
            # log(bigger / group_size) * group_size -> 1 as group_size -> inf
            return 1.0 + log(bigger)
        return group_size * log(bigger / group_size) + log(bigger)
Exemplo n.º 17
0
    def _approximate_postpred_correction(self, sample_size):
        '''
        ad hoc approximation,
        see `python derivations/clustering.py postpred`
        see `python derivations/clustering.py approximations`
        '''
        assert 0 < sample_size
        assert sample_size < self.dataset_size

        exponent = 0.45 - 0.1 / sample_size - 0.1 / self.dataset_size
        scale = self.dataset_size / sample_size
        return log(scale) * exponent
Exemplo n.º 18
0
    def _approximate_postpred_correction(self, sample_size):
        '''
        ad hoc approximation,
        see `python derivations/clustering.py postpred`
        see `python derivations/clustering.py approximations`
        '''
        assert 0 < sample_size
        assert sample_size < self.dataset_size

        exponent = 0.45 - 0.1 / sample_size - 0.1 / self.dataset_size
        scale = self.dataset_size / sample_size
        return log(scale) * exponent
Exemplo n.º 19
0
 def score_value(self, shared, value):
     """
     Posterior predictive log probability of a single value.

     Adapted from dd.py, which was adapted from:
     McCallum, et. al, 'Rethinking LDA: Why Priors Matter' eqn 4
     """
     denom = shared.alpha + self.total
     if value == OTHER:
         # pseudo-count mass reserved for not-yet-seen values
         numer = shared.beta0 * shared.alpha
     else:
         count = self.counts.get(value, 0)
         assert count >= 0, "cannot score while in debt"
         numer = shared.alpha * shared.betas[value] + count
     return log(numer / denom)
Exemplo n.º 20
0
 def score_value(self, shared, value):
     """
     Posterior predictive log probability of a single value.

     Adapted from dd.py, which was adapted from:
     McCallum, et. al, 'Rethinking LDA: Why Priors Matter' eqn 4
     """
     denom = shared.alpha + self.total
     if value == OTHER:
         # pseudo-count mass reserved for not-yet-seen values
         numer = shared.beta0 * shared.alpha
     else:
         count = self.counts.get(value, 0)
         assert count >= 0, "cannot score while in debt"
         numer = shared.alpha * shared.betas[value] + count
     return log(numer / denom)
Exemplo n.º 21
0
    def log_partition_function(self, sample_size):
        '''
        Computes

            log_sum_exp(
                sum(n * log(n) for n in partition)
                for partition in partitions(sample_size)
            )

        exactly for small n, and approximately for large n.
        '''
        # TODO incorporate dataset_size for higher accuracy
        n = sample_size
        if n < 48:
            # exact value from a precomputed table
            return LowEntropy.log_partition_function_table[n]
        # asymptotic expansion around the dominant term n * log(n)
        correction = 0.28269584 * n ** -0.75
        return n * log(n) * (1.0 + correction)
Exemplo n.º 22
0
    def log_partition_function(self, sample_size):
        '''
        Computes

            log_sum_exp(
                sum(n * log(n) for n in partition)
                for partition in partitions(sample_size)
            )

        exactly for small n, and approximately for large n.
        '''
        # TODO incorporate dataset_size for higher accuracy
        n = sample_size
        if n < 48:
            # exact value from a precomputed table
            return LowEntropy.log_partition_function_table[n]
        # asymptotic expansion around the dominant term n * log(n)
        correction = 0.28269584 * n ** -0.75
        return n * log(n) * (1.0 + correction)
Exemplo n.º 23
0
    def score_counts(self, counts):
        '''
        Return log probability of data, given sufficient statistics of
        a partial assignment vector [X_0,...,X_{n-1}]

            log P[ X_0=x_0, ..., X_{n-1}=x_{n-1} ]
        '''
        sample_size = 0
        score = 0.0
        for group_size in counts:
            sample_size += group_size
            if group_size > 1:
                # each nontrivial group contributes n * log(n)
                score += group_size * log(group_size)
        assert sample_size <= self.dataset_size

        if sample_size != self.dataset_size:
            # partial assignment: apply ad hoc corrections
            score += (len(counts) - 1) * \
                self._approximate_postpred_correction(sample_size)
            score += self._approximate_dataprob_correction(sample_size)
        return score - self.log_partition_function(sample_size)
Exemplo n.º 24
0
    def score_counts(self, counts):
        '''
        Return log probability of data, given sufficient statistics of
        a partial assignment vector [X_0,...,X_{n-1}]

            log P[ X_0=x_0, ..., X_{n-1}=x_{n-1} ]
        '''
        sample_size = 0
        score = 0.0
        for group_size in counts:
            sample_size += group_size
            if group_size > 1:
                # each nontrivial group contributes n * log(n)
                score += group_size * log(group_size)
        assert sample_size <= self.dataset_size

        if sample_size != self.dataset_size:
            # partial assignment: apply ad hoc corrections
            score += (len(counts) - 1) * \
                self._approximate_postpred_correction(sample_size)
            score += self._approximate_dataprob_correction(sample_size)
        return score - self.log_partition_function(sample_size)
Exemplo n.º 25
0
 def score_value(self, shared, value):
     """log probability of a single draw under the explicit p"""
     probability = self.p if value else 1. - self.p
     return log(probability)
Exemplo n.º 26
0
 def remove_value(self, shared, value):
     """Downdate sufficient statistics by removing one observation."""
     self.count = self.count - 1
     self.sum = self.sum - int(value)
     self.log_prod = self.log_prod - log(factorial(value))
Exemplo n.º 27
0
 def add_repeated_value(self, shared, value, count):
     """Update sufficient statistics with `count` copies of `value`."""
     self.count = self.count + count
     self.sum = self.sum + int(count * value)
     self.log_prod = self.log_prod + count * log(factorial(value))
Exemplo n.º 28
0
 def add_value(self, shared, value):
     """Update sufficient statistics with one observation of `value`."""
     self.count = self.count + 1
     self.sum = self.sum + int(value)
     self.log_prod = self.log_prod + log(factorial(value))
Exemplo n.º 29
0
 def score_data(self, shared):
     """Marginal log likelihood of this group's data under the
     gamma-Poisson conjugate prior."""
     post = shared.plus_group(self)
     score = gammaln(post.alpha) - gammaln(shared.alpha)
     score -= post.alpha * log(post.inv_beta)
     score += shared.alpha * log(shared.inv_beta)
     return score - self.log_prod
Exemplo n.º 30
0
 def score_data(self, shared):
     """Marginal log likelihood of this group's data under the
     gamma-Poisson conjugate prior."""
     post = shared.plus_group(self)
     score = gammaln(post.alpha) - gammaln(shared.alpha)
     score -= post.alpha * log(post.inv_beta)
     score += shared.alpha * log(shared.inv_beta)
     return score - self.log_prod
Exemplo n.º 31
0
 def remove_value(self, shared, value):
     """Downdate sufficient statistics by removing one observation."""
     self.count = self.count - 1
     self.sum = self.sum - int(value)
     self.log_prod = self.log_prod - log(factorial(value))
Exemplo n.º 32
0
 def add_repeated_value(self, shared, value, count):
     """Update sufficient statistics with `count` copies of `value`."""
     self.count = self.count + count
     self.sum = self.sum + int(count * value)
     self.log_prod = self.log_prod + count * log(factorial(value))
Exemplo n.º 33
0
 def add_value(self, shared, value):
     """Update sufficient statistics with one observation of `value`."""
     self.count = self.count + 1
     self.sum = self.sum + int(value)
     self.log_prod = self.log_prod + log(factorial(value))
Exemplo n.º 34
0
 def score_value(self, shared, value):
     """Posterior predictive log probability of a single count `value`
     (negative-binomial form of the gamma-Poisson posterior)."""
     post = shared.plus_group(self)
     score = gammaln(post.alpha + value) - gammaln(post.alpha)
     score += post.alpha * log(post.inv_beta)
     score -= (post.alpha + value) * log(1. + post.inv_beta)
     return score - log(factorial(value))
Exemplo n.º 35
0
 def score_value(self, shared, value):
     """log probability of a single draw under the explicit p"""
     probability = self.p if value else 1. - self.p
     return log(probability)
Exemplo n.º 36
0
 def score_value(self, shared, value):
     """Posterior predictive log probability of a single count `value`
     (negative-binomial form of the gamma-Poisson posterior)."""
     post = shared.plus_group(self)
     score = gammaln(post.alpha + value) - gammaln(post.alpha)
     score += post.alpha * log(post.inv_beta)
     score -= (post.alpha + value) * log(1. + post.inv_beta)
     return score - log(factorial(value))