示例#1
0
    def log_predictive_prob(self, new_corpus, num_samples):
        """
        Estimates the log predictive probability of a new corpus,
        one document at a time, by averaging over 'num_samples'
        independently maintained sets of document--component
        assignments for the new documents.

        For each new document d and each sample r, the assignments
        of the new documents seen so far (0, ..., d - 1) are first
        resampled in order. The log probability of document d is then
        computed from the resulting counts, and finally an assignment
        for d is sampled and its counts are added to sample r.

        This instance's own counts are only read, never modified.

        NOTE(review): 'log_sample' is assumed to draw an index from
        an unnormalized log distribution and 'log_sum_exp' to return
        the log of a sum of exponentials; both are defined outside
        this method -- confirm.

        Arguments:

        new_corpus -- corpus of new documents; must support len(),
        iteration and a 'documents' sequence
        num_samples -- number of assignment samples to average over

        Returns the sum, over new documents, of the log of the
        per-sample probabilities averaged over samples.
        """

        V, T = self.V, self.T

        Nvt_plus_beta_n = self.Nvt_plus_beta_n
        Nt_plus_beta = self.Nt_plus_beta

        Dt_plus_alpha_m = self.Dt_plus_alpha_m
        D_plus_alpha = self.D_plus_alpha

        # Per-sample counts and assignments for the new documents only.

        Nvt_new = [zeros((T, V), dtype=int) for _ in xrange(num_samples)]
        Nt_new = [zeros(T, dtype=int) for _ in xrange(num_samples)]
        Dt_new = [zeros(T, dtype=int) for _ in xrange(num_samples)]
        z_new = [zeros(len(new_corpus), dtype=int)
                 for _ in xrange(num_samples)]

        def log_weights(r, doc):
            # Unnormalized log weight of every component for 'doc'
            # under the current counts of sample r (combined with
            # this instance's counts).
            return (
                gammaln(Nt_new[r] + Nt_plus_beta)
                - gammaln(Nvt_new[r] + Nvt_plus_beta_n).sum(axis=1)
                + gammaln(tile(doc.Nv, (T, 1)) + Nvt_new[r]
                          + Nvt_plus_beta_n).sum(axis=1)
                - gammaln(len(doc) * ones(T) + Nt_new[r] + Nt_plus_beta)
                + log(Dt_new[r] + Dt_plus_alpha_m)
            )

        log_p = 0

        for d, doc in enumerate(iterview(new_corpus)):

            tmp = zeros(num_samples, dtype=float)

            for r in xrange(num_samples):

                # Resample the assignments of the previously seen
                # new documents.

                for prev_d in xrange(d):

                    # The previous documents belong to 'new_corpus'
                    # (z_new is indexed by position in new_corpus).

                    prev_doc = new_corpus.documents[prev_d]
                    t = z_new[r][prev_d]

                    Nvt_new[r][t, :] -= prev_doc.Nv
                    Nt_new[r][t] -= len(prev_doc)
                    Dt_new[r][t] -= 1

                    t = log_sample(log_weights(r, prev_doc))

                    Nvt_new[r][t, :] += prev_doc.Nv
                    Nt_new[r][t] += len(prev_doc)
                    Dt_new[r][t] += 1

                    z_new[r][prev_d] = t

                # Subtracting log(d + D_plus_alpha) is applied to the
                # weights of the current document only, not to the
                # resampling weights above.

                log_dist = log_weights(r, doc) - log(d + D_plus_alpha)

                tmp[r] = log_sum_exp(log_dist)

                t = log_sample(log_dist)

                Nvt_new[r][t, :] += doc.Nv
                Nt_new[r][t] += len(doc)
                Dt_new[r][t] += 1

                z_new[r][d] = t

            # Log of the average over samples.

            log_p += log_sum_exp(tmp) - log(num_samples)

        return log_p
示例#2
0
    def gibbs_iteration(self, init=False):
        """
        Performs one Gibbs sampling sweep over this instance's
        corpus, drawing a new component assignment for every
        document in turn and updating the counts in place.

        Unless keyword argument 'init' is true, the counts that a
        document contributes under its current assignment are removed
        before its new assignment is drawn. When 'init' is true
        nothing is removed, i.e., assignments and counts are being
        built up for the first time.

        Keyword arguments:

        init -- whether to initialize document--component assignments
        """

        num_components = self.T
        prior = self.alpha_m

        word_counts = self.Nvt_plus_beta_n
        total_counts = self.Nt_plus_beta
        doc_counts = self.Dt

        assignments = self.z

        pairs = zip(self.corpus, assignments)

        for index, (document, component) in enumerate(iterview(pairs)):

            doc_words = document.Nv
            doc_length = len(document)

            if not init:

                # Take the document out of its current component.

                word_counts[component, :] -= doc_words
                total_counts[component] -= doc_length
                doc_counts[component] -= 1

            # Terms of the unnormalized log weight of each component,
            # combined below in a fixed left-to-right order.

            totals_without = gammaln(total_counts)
            words_without = gammaln(word_counts).sum(axis=1)
            words_with = gammaln(
                tile(doc_words, (num_components, 1)) + word_counts
            ).sum(axis=1)
            totals_with = gammaln(
                doc_length * ones(num_components) + total_counts
            )
            log_prior = log(doc_counts + prior)

            component = log_sample(
                totals_without - words_without + words_with
                - totals_with + log_prior
            )

            # Put the document into the component just drawn.

            word_counts[component, :] += doc_words
            total_counts[component] += doc_length
            doc_counts[component] += 1

            assignments[index] = component
示例#3
0
    def gibbs_iteration(self, init=False):
        """
        Runs a single pass of Gibbs sampling over the documents of
        this instance's corpus, replacing each document's component
        assignment with a freshly sampled one.

        The counts held by this instance are updated in place. If
        keyword argument 'init' is false (the default), each
        document's contribution to the counts of its current
        component is subtracted before sampling; if it is true, that
        step is skipped because no assignments exist yet.

        Keyword arguments:

        init -- whether to initialize document--component assignments
        """

        T = self.T

        Nvt = self.Nvt_plus_beta_n
        Nt = self.Nt_plus_beta
        Dt = self.Dt_plus_alpha_m

        z = self.z

        for d, (doc, old_t) in enumerate(iterview(zip(self.corpus, z))):

            Nv = doc.Nv
            N = len(doc)

            if not init:

                # Remove the document from its current component.

                Nvt[old_t, :] -= Nv
                Nt[old_t] -= N
                Dt[old_t] -= 1

            # Accumulate the unnormalized log weights term by term,
            # in the order in which they are combined.

            log_dist = gammaln(Nt) - gammaln(Nvt).sum(axis=1)
            log_dist = log_dist + gammaln(tile(Nv, (T, 1)) + Nvt).sum(axis=1)
            log_dist = log_dist - gammaln(N * ones(T) + Nt)
            log_dist = log_dist + log(Dt)

            new_t = log_sample(log_dist)

            # Add the document to the sampled component.

            Nvt[new_t, :] += Nv
            Nt[new_t] += N
            Dt[new_t] += 1

            z[d] = new_t
示例#4
0
    def log_predictive_prob(self, new_corpus, num_samples):
        """
        Estimates the log predictive probability of a new corpus,
        one document at a time, using 'num_samples' separately
        maintained sets of document--component assignments for the
        new documents.

        For every new document d and every sample r:

        1. the assignments of new documents 0, ..., d - 1 are
           resampled in order;
        2. the log probability of document d is computed from the
           resulting counts and stored for sample r;
        3. an assignment for d is sampled and its counts are added
           to sample r.

        The per-sample values are then averaged (in log space) and
        accumulated over documents. This instance's own counts are
        only read, never modified.

        NOTE(review): 'log_sample' is assumed to draw an index from
        an unnormalized log distribution and 'log_sum_exp' to return
        the log of a sum of exponentials; both are defined outside
        this method -- confirm.

        Arguments:

        new_corpus -- corpus of new documents; must support len(),
        iteration and a 'documents' sequence
        num_samples -- number of assignment samples to average over

        Returns the accumulated log predictive probability.
        """

        V, T = self.V, self.T

        Nvt_plus_beta_n = self.Nvt_plus_beta_n
        Nt_plus_beta = self.Nt_plus_beta

        Dt_plus_alpha_m = self.Dt_plus_alpha_m
        D_plus_alpha = self.D_plus_alpha

        Nvt_new, Nt_new, Dt_new, z_new = [], [], [], []

        for r in xrange(num_samples):

            Nvt_new.append(zeros((T, V), dtype=int))
            Nt_new.append(zeros(T, dtype=int))

            Dt_new.append(zeros(T, dtype=int))

            z_new.append(zeros(len(new_corpus), dtype=int))

        def component_log_weights(r, doc):
            # Unnormalized log weight of each component for 'doc',
            # given the current counts of sample r combined with
            # this instance's counts.
            return (
                gammaln(Nt_new[r] + Nt_plus_beta) -
                gammaln(Nvt_new[r] + Nvt_plus_beta_n).sum(axis=1) +
                gammaln(
                    tile(doc.Nv, (T, 1)) + Nvt_new[r] +
                    Nvt_plus_beta_n).sum(axis=1) -
                gammaln(len(doc) * ones(T) + Nt_new[r] + Nt_plus_beta) +
                log(Dt_new[r] + Dt_plus_alpha_m))

        log_p = 0

        for d, doc in enumerate(iterview(new_corpus)):

            tmp = zeros(num_samples, dtype=float)

            for r in xrange(num_samples):
                for prev_d in xrange(0, d):

                    # Previous documents come from 'new_corpus';
                    # z_new is indexed by position in that corpus.

                    prev_doc = new_corpus.documents[prev_d]
                    t = z_new[r][prev_d]

                    Nvt_new[r][t, :] -= prev_doc.Nv
                    Nt_new[r][t] -= len(prev_doc)
                    Dt_new[r][t] -= 1

                    t = log_sample(component_log_weights(r, prev_doc))

                    Nvt_new[r][t, :] += prev_doc.Nv
                    Nt_new[r][t] += len(prev_doc)
                    Dt_new[r][t] += 1

                    z_new[r][prev_d] = t

                # Log weights for the current document; subtracting
                # log(d + D_plus_alpha) is done here only, not for
                # the resampling weights above.

                log_dist = (component_log_weights(r, doc) -
                            log(d + D_plus_alpha))

                # Record this sample's value before sampling t, so
                # that tmp[r] is set and t refers to the current
                # document when the counts are updated below.

                tmp[r] = log_sum_exp(log_dist)

                t = log_sample(log_dist)

                Nvt_new[r][t, :] += doc.Nv
                Nt_new[r][t] += len(doc)
                Dt_new[r][t] += 1

                z_new[r][d] = t

            # Log of the average over samples.

            log_p += log_sum_exp(tmp) - log(num_samples)

        return log_p